El retraso de los vuelos de un aeropuerto es un problema tanto para los clientes como para los controladores aéreos, que tienen que reprogramar los aterrizajes.
El objetivo de este notebook es desarrollar un modelo que permita predecir la probabilidad de que un vuelo se retrase más de 15 minutos, en el aeropuerto JFK de Nueva York.
Para ello se disponen de los datos de los diferentes vuelos que aterrizan en JFK (datos obtenidos del Departamento de transporte de Estados Unidos): https://drive.google.com/file/d/1RjKWadu92BBBBWSkU6xUkqR4oeTz7MJB/view?usp=drive_link
Descargamos los datos y los metemos en la carpeta 'data'.
# Create the folders used throughout the notebook.
# os.makedirs(..., exist_ok=True) is the idiomatic replacement for the
# manual try/except FileExistsError around os.mkdir.
import os
for folder in ('data', 'report', 'pickle', 'final'):
    os.makedirs(folder, exist_ok=True)  # idempotent: no error if it already exists
# Show an illustrative photo of JFK airport; the notebook renders it
# because Image(...) is the last expression of the cell.
from IPython.display import Image
Image(url='https://cdn.bitlysdowssl-aws.com/wp-content/uploads/2021/12/Aeropuerto-JFK.jpg')
DataFrameReporter Nos va a permitir hacer un primer contacto con el dataset.\ Se trata de una clase auxiliar que hará una parte del EDA.
class DataFrameReporter(object):
    '''
    Helper that summarises a DataFrame: dtypes, null counts and the
    describe() statistics of both numeric and categorical columns.
    '''
    def __init__(self, df):
        '''
        Store the DataFrame to be analysed.
        '''
        self.df = df
    def analyze_X(self, X):
        '''
        Build a per-column report: dtype, shape, absolute/relative nulls
        and the describe() output for numeric and categorical columns.
        '''
        dtype_frame = X.dtypes.to_frame().rename(columns = {0: "Dtypes"})
        report = X.isnull().sum().to_frame().rename(columns = {0: "Absolute_nulls"})
        fraction_nulls = report["Absolute_nulls"] / X.shape[0]
        # Percentage of nulls; rounded to 3 decimals *before* scaling by 100
        # (kept as-is to preserve the original behaviour).
        report["Relative_nulls"] = fraction_nulls.apply(lambda frac: round(frac, 3) * 100)
        report = pd.concat([report, dtype_frame], axis = 1)
        report["Shape"] = X.shape[0]  # total number of rows
        report = report[["Dtypes", "Shape", "Absolute_nulls", "Relative_nulls"]]
        # Append numeric describe() first, then the categorical one.
        report = pd.concat([report, X.describe().T], axis = 1)
        report = pd.concat([report, X.describe(exclude = "number").T], axis = 1)
        report.fillna("", inplace = True)  # blank out the NaNs of non-applicable stats
        report.sort_values("Dtypes", ascending = True, inplace = True)  # group rows by dtype
        return report
    def get_reports(self):
        '''
        Run analyze_X on the stored DataFrame and return the report.
        '''
        return self.analyze_X(X = self.df)
# Remove duplicated rows
def drop_duplicates (df):
    '''
    Drop duplicated rows from *df* in place, print how many were
    removed, and return the subset of rows that had duplicates.
    '''
    print(f"{df.shape}")
    all_copies_mask = df.duplicated(keep = False)   # every occurrence of a duplicated row
    duplicated_rows = df[all_copies_mask]
    removed_count = df.duplicated().sum()           # copies beyond the first occurrence
    print(f"Se han eliminado {removed_count} registros repetidos")
    df.drop_duplicates(inplace = True)
    print(f"{df.shape}")
    return duplicated_rows
# Nulls per variable
def nulos_variable (dataframe):
    '''
    Print a summary of the columns containing nulls, split into numeric
    and categorical variables, plus the combined total.

    Parameters
    ----------
    dataframe : pd.DataFrame
        DataFrame to inspect.
    '''
    # Compute each null-count Series once (the original repeated the
    # whole select_dtypes().isnull().sum() pipeline inside the filter).
    numeric_nulls = dataframe.select_dtypes(include = np.number).isnull().sum()
    nulos_numericas = numeric_nulls[numeric_nulls > 0]
    categorical_nulls = dataframe.select_dtypes(exclude = np.number).isnull().sum()
    nulos_categoricas = categorical_nulls[categorical_nulls > 0]
    print ('Las variables numéricas que tienen nulos son:\n', nulos_numericas, '\n\nHay un total de', len(nulos_numericas), 'variables numéricas con nulos')
    print ('\n\nLas variables categóricas que tienen nulos son:\n', nulos_categoricas, '\n\nHay un total de', len(nulos_categoricas), 'variables categóricas con nulos')
    print ('\n\nVARIABLES TOTALES CON NULOS:', len(nulos_numericas)+len(nulos_categoricas))
# Removal of low-variance numeric variables
def varianza_nula (dataframe, std):
    '''
    Drop (in place) the numeric columns whose standard deviation falls
    below the *std* threshold, then return the head of the result.
    '''
    numeric_stats = dataframe.describe(include=np.number).T
    low_variance_cols = numeric_stats.index[numeric_stats['std'] < std].tolist()  # threshold supplied by caller
    dataframe.drop(columns=low_variance_cols, inplace=True)
    print('Se han borrado las siguientes variables numéricas por tener baja varianza:\n',low_variance_cols )
    return dataframe.head()
# Variable splitting: build per-type lists of column names
def obtener_lista_variables(dataset, target_cols=None):
    '''
    Classify the columns of *dataset* into numeric, boolean-like and
    categorical lists, excluding the target column(s).

    Parameters
    ----------
    dataset : pd.DataFrame
        DataFrame whose columns are classified.
    target_cols : str or collection of str, optional
        Column name(s) to exclude from every list. When omitted, the
        module-level ``target`` variable is used (legacy behaviour).

    Returns
    -------
    tuple of (list, list, list)
        (numeric, boolean-like, categorical) column names.
    '''
    if target_cols is None:
        target_cols = target  # fall back to the notebook-level global, as before
    lista_numerica = []
    lista_boolean = []     # numeric dtype but exactly 2 distinct values
    lista_categorica = []  # object dtype (datetime columns are skipped on purpose)
    for col in dataset.columns:
        if col in target_cols:  # NOTE: substring match when target_cols is a str (legacy behaviour)
            continue
        kind = dataset[col].dtype.kind
        if kind in ('i', 'f', 'b'):
            if len(dataset[col].unique()) == 2:
                lista_boolean.append(col)
            else:
                lista_numerica.append(col)
        elif kind == 'O':  # elif (not else) so datetime never lands in the categorical list
            lista_categorica.append(col)
    return lista_numerica, lista_boolean, lista_categorica
# Numeric variables VS target analysis
def analisis_numericas (dataframe, column, target):
    '''
    Pivot table summarising the target (count, min, mean, max) for each
    value of a numeric column, ordered by the column values.
    '''
    print(f"\nANÁLISIS DE {column} CON RESPECTO AL {target}\n")
    summary = dataframe.pivot_table(index=column,
                                    values=target,
                                    aggfunc=[len, min, np.mean, max])
    return summary.sort_values(by=column, ascending=True)
# Numeric variables VS target visualisation
def plot_numericas(dataframe,column,target):
    '''
    Boxplot of a numeric column split by the target classes, so the
    distribution per class can be compared visually.
    '''
    plt.figure(figsize=[8,3])
    plt.title(f'Analisis de {column} VS {target}')
    sns.boxplot(data = dataframe, x = target, y = column, palette="coolwarm")
    return plt.show()
# Categorical variables VS target analysis
def analisis_categoricas (dataframe,column,target):
    '''
    Pivot table with count, sum and mean of the target per category,
    ordered by frequency (most common category first).
    '''
    print(f"\nANÁLISIS DE {column} CON RESPECTO AL {target}\n")
    summary = dataframe.pivot_table(index=column,
                                    values=target,
                                    aggfunc=[len, sum, np.mean])
    return summary.sort_values(by=('len', target), ascending=False)
# Categorical variables VS target visualisation
def plot_categoricas(dataframe,column,target):
    '''
    Countplot of a categorical column split by target class.
    TAIL_NUM is skipped: it has far too many distinct values to plot.
    '''
    if column != 'TAIL_NUM':  # guard clause replaces the original pass/else
        plt.figure(figsize=[20,5])
        plt.title(f'Analisis de {column} VS {target}')
        plt.xticks(rotation = 90)
        sns.countplot(data = dataframe, x = column, hue = target )
        return plt.show()
En primer lugar, debemos importar las 4 librerías básicas utilizadas en Machine Learning:
import time
# sistema operativo
import os
# silence warnings
import warnings
warnings.filterwarnings("ignore")
# Descargas de archivos
import wget
# Pandas y Numpy
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# loading ploting libraries
import matplotlib.pyplot as plt
import seaborn as sns
#from matplotlib import pyplot as plt
%matplotlib inline
# visualiza gráficos de diagramas --> DecisionTree
import graphviz
# transformers
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn import __version__ as sklearn_version # versión de sklearn que tengo instalada
# scikits de modelización
from sklearn import model_selection # evaluación de modelos y estrategias de validación de modelos
from sklearn import metrics # métricas de validación de modelos
from sklearn.tree import DecisionTreeClassifier # algoritmo
from sklearn.tree import export_graphviz # exporta el diagrama de arbol en formato DOT (formato con texto plano que se usa para grafos)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # métodos de ensamble
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedKFold, KFold
# config
plt.style.use('ggplot')
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.2f}'.format
# Versiones de las librerías instaladas
print("Working with these versions of libraries\n")
print(f"Numpy version {np.__version__}")
print(f"Pandas version {pd.__version__}")
print(f"Scikit-Learn version {sklearn_version}")
Working with these versions of libraries Numpy version 1.21.5 Pandas version 1.4.4 Scikit-Learn version 1.3.0
#Si se usa Collab
'''from google.colab import drive
drive.mount('/content/drive')'''
"from google.colab import drive\ndrive.mount('/content/drive')"
# Asigno la ruta actual
os.chdir('D:\Data_Science\MACHINE_LEARNING\ML_SUPERVISADO\ML_SUPERVISADO_CLASIFICACION\Machine-Learning-Binary-Classifier-JFK-DELAY')
# Veo la ruta actual
os.getcwd()
'D:\\Data_Science\\MACHINE_LEARNING\\ML_SUPERVISADO\\ML_SUPERVISADO_CLASIFICACION\\Machine-Learning-Binary-Classifier-JFK-DELAY'
Introducimos los datos en un objeto de pandas, un dataframe, de esa forma podemos utilizar todos los métodos y funciones que tiene disponible el dataframe.
Cargamos los datos
%%time
# tiempo que tarda en cargar los datos
jfk = pd.read_csv('data/JFK_Flights_Data.csv')
jfk.head()
Wall time: 4.09 s
| FL_DATE | OP_CARRIER_AIRLINE_ID | TAIL_NUM | ORIGIN | ORIGIN_CITY_NAME | ORIGIN_STATE_NM | DEST | DEST_CITY_NAME | DEST_STATE_NM | CRS_DEP_TIME | TAXI_OUT | CRS_ARR_TIME | ARR_DEL15 | CRS_ELAPSED_TIME | DISTANCE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2018-02-01 | 20,363.00 | N299PQ | BNA | Nashville, TN | Tennessee | JFK | New York, NY | New York | 1359 | 27.00 | 1719 | 0.00 | 140.00 | 765.00 |
| 1 | 2018-02-01 | 20,363.00 | N920XJ | RDU | Raleigh/Durham, NC | North Carolina | JFK | New York, NY | New York | 1354 | 37.00 | 1541 | 0.00 | 107.00 | 427.00 |
| 2 | 2018-02-01 | 20,363.00 | N605LR | BUF | Buffalo, NY | New York | JFK | New York, NY | New York | 1529 | 21.00 | 1709 | 0.00 | 100.00 | 301.00 |
| 3 | 2018-02-01 | 20,363.00 | N800AY | ORF | Norfolk, VA | Virginia | JFK | New York, NY | New York | 1704 | 36.00 | 1845 | 0.00 | 101.00 | 290.00 |
| 4 | 2018-02-01 | 20,363.00 | N600LR | BOS | Boston, MA | Massachusetts | JFK | New York, NY | New York | 1325 | 20.00 | 1454 | 0.00 | 89.00 | 187.00 |
Comprobamos que no haya ningún registro duplicado
drop_duplicates(df = jfk)
(250152, 15) Se han eliminado 0 registros repetidos (250152, 15)
| FL_DATE | OP_CARRIER_AIRLINE_ID | TAIL_NUM | ORIGIN | ORIGIN_CITY_NAME | ORIGIN_STATE_NM | DEST | DEST_CITY_NAME | DEST_STATE_NM | CRS_DEP_TIME | TAXI_OUT | CRS_ARR_TIME | ARR_DEL15 | CRS_ELAPSED_TIME | DISTANCE |
|---|
A.- Data Size
# Tamaño del dataset
jfk.size
3752280
# Número de filas y columnas
jfk.shape
(250152, 15)
# Información resumida del dataset
jfk.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Int64Index: 250152 entries, 0 to 250151 Columns: 15 entries, FL_DATE to DISTANCE dtypes: float64(5), int64(2), object(8) memory usage: 30.5+ MB
B.- Visualización directa de los datos
# Columnas del dataset
jfk.columns
Index(['FL_DATE', 'OP_CARRIER_AIRLINE_ID', 'TAIL_NUM', 'ORIGIN',
'ORIGIN_CITY_NAME', 'ORIGIN_STATE_NM', 'DEST', 'DEST_CITY_NAME',
'DEST_STATE_NM', 'CRS_DEP_TIME', 'TAXI_OUT', 'CRS_ARR_TIME',
'ARR_DEL15', 'CRS_ELAPSED_TIME', 'DISTANCE'],
dtype='object')
# 5 primeros registros
jfk.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| FL_DATE | 2018-02-01 | 2018-02-01 | 2018-02-01 | 2018-02-01 | 2018-02-01 |
| OP_CARRIER_AIRLINE_ID | 20,363.00 | 20,363.00 | 20,363.00 | 20,363.00 | 20,363.00 |
| TAIL_NUM | N299PQ | N920XJ | N605LR | N800AY | N600LR |
| ORIGIN | BNA | RDU | BUF | ORF | BOS |
| ORIGIN_CITY_NAME | Nashville, TN | Raleigh/Durham, NC | Buffalo, NY | Norfolk, VA | Boston, MA |
| ORIGIN_STATE_NM | Tennessee | North Carolina | New York | Virginia | Massachusetts |
| DEST | JFK | JFK | JFK | JFK | JFK |
| DEST_CITY_NAME | New York, NY | New York, NY | New York, NY | New York, NY | New York, NY |
| DEST_STATE_NM | New York | New York | New York | New York | New York |
| CRS_DEP_TIME | 1359 | 1354 | 1529 | 1704 | 1325 |
| TAXI_OUT | 27.00 | 37.00 | 21.00 | 36.00 | 20.00 |
| CRS_ARR_TIME | 1719 | 1541 | 1709 | 1845 | 1454 |
| ARR_DEL15 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| CRS_ELAPSED_TIME | 140.00 | 107.00 | 100.00 | 101.00 | 89.00 |
| DISTANCE | 765.00 | 427.00 | 301.00 | 290.00 | 187.00 |
# 5 registros aleatorios
jfk.sample(5).T
| 182652 | 226923 | 54984 | 24157 | 245716 | |
|---|---|---|---|---|---|
| FL_DATE | 2018-05-22 | 2018-08-27 | 2019-03-15 | 2018-12-15 | 2018-10-19 |
| OP_CARRIER_AIRLINE_ID | 20,452.00 | 19,790.00 | 19,790.00 | 19,790.00 | 19,805.00 |
| TAIL_NUM | N401YX | N860DN | N327NW | NaN | N837NN |
| ORIGIN | DCA | SEA | PBI | MSY | SAN |
| ORIGIN_CITY_NAME | Washington, DC | Seattle, WA | West Palm Beach/Palm Beach, FL | New Orleans, LA | San Diego, CA |
| ORIGIN_STATE_NM | Virginia | Washington | Florida | Louisiana | California |
| DEST | JFK | JFK | JFK | JFK | JFK |
| DEST_CITY_NAME | New York, NY | New York, NY | New York, NY | New York, NY | New York, NY |
| DEST_STATE_NM | New York | New York | New York | New York | New York |
| CRS_DEP_TIME | 1458 | 1525 | 1206 | 1300 | 746 |
| TAXI_OUT | 21.00 | 13.00 | 15.00 | 8.00 | 23.00 |
| CRS_ARR_TIME | 1629 | 2359 | 1509 | 1654 | 1615 |
| ARR_DEL15 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| CRS_ELAPSED_TIME | 91.00 | 334.00 | 183.00 | 174.00 | 329.00 |
| DISTANCE | 213.00 | 2,422.00 | 1,028.00 | 1,182.00 | 2,446.00 |
# 5 últimos registros
jfk.tail().T
| 250147 | 250148 | 250149 | 250150 | 250151 | |
|---|---|---|---|---|---|
| FL_DATE | 2018-10-31 | 2018-10-31 | 2018-10-31 | 2018-10-31 | 2018-10-31 |
| OP_CARRIER_AIRLINE_ID | 20,398.00 | 20,398.00 | 20,452.00 | 20,452.00 | 20,452.00 |
| TAIL_NUM | N801AE | N848AE | N870RW | N818MD | N878RW |
| ORIGIN | BOS | RDU | CMH | CMH | PIT |
| ORIGIN_CITY_NAME | Boston, MA | Raleigh/Durham, NC | Columbus, OH | Columbus, OH | Pittsburgh, PA |
| ORIGIN_STATE_NM | Massachusetts | North Carolina | Ohio | Ohio | Pennsylvania |
| DEST | JFK | JFK | JFK | JFK | JFK |
| DEST_CITY_NAME | New York, NY | New York, NY | New York, NY | New York, NY | New York, NY |
| DEST_STATE_NM | New York | New York | New York | New York | New York |
| CRS_DEP_TIME | 1453 | 1944 | 1757 | 707 | 1033 |
| TAXI_OUT | 15.00 | 14.00 | 13.00 | 11.00 | 14.00 |
| CRS_ARR_TIME | 1619 | 2130 | 2000 | 900 | 1201 |
| ARR_DEL15 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| CRS_ELAPSED_TIME | 86.00 | 106.00 | 123.00 | 113.00 | 88.00 |
| DISTANCE | 187.00 | 427.00 | 483.00 | 483.00 | 340.00 |
Información del dataset
Cada fila nos da información de un vuelo determinado que aterriza en el aeropuerto JFK de Nueva York.\ Cada columna nos da la siguiente información:
NOTA: A la hora de elegir el target tenemos que tener en cuenta el tipo de modelo que es. Como se trata de un modelo de clasificación binaria, tomaremos como target la variable ARR_DEL15. Si es otro caso tomaríamos como target la variable ARR_DEL.
C.- Tipo de atributos disponibles
Vemos si los atributos de nuestro dataframe son numéricos o categóricos. Se clasifican, de menor a mayor espacio que ocupan, en:
Numeros (continuas)
Categoricas (discretas)
jfk.info(verbose=True)
<class 'pandas.core.frame.DataFrame'> Int64Index: 250152 entries, 0 to 250151 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 FL_DATE 250152 non-null object 1 OP_CARRIER_AIRLINE_ID 237644 non-null float64 2 TAIL_NUM 235143 non-null object 3 ORIGIN 250152 non-null object 4 ORIGIN_CITY_NAME 250152 non-null object 5 ORIGIN_STATE_NM 250152 non-null object 6 DEST 250152 non-null object 7 DEST_CITY_NAME 250152 non-null object 8 DEST_STATE_NM 250152 non-null object 9 CRS_DEP_TIME 250152 non-null int64 10 TAXI_OUT 232641 non-null float64 11 CRS_ARR_TIME 250152 non-null int64 12 ARR_DEL15 250152 non-null float64 13 CRS_ELAPSED_TIME 250152 non-null float64 14 DISTANCE 250152 non-null float64 dtypes: float64(5), int64(2), object(8) memory usage: 30.5+ MB
D.- Estadísticos descriptivos básicos y distribución de los nulos
Estadísticos descriptivos
Mediante el método describe(), podemos obtener los estadísticos representativos de cada uno de los atributos del DataFrame.
# Atributos numéricos
jfk.describe(include = 'number').T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| OP_CARRIER_AIRLINE_ID | 237,644.00 | 20,152.00 | 297.88 | 19,690.00 | 19,790.00 | 20,363.00 | 20,409.00 | 21,171.00 |
| CRS_DEP_TIME | 250,152.00 | 1,334.61 | 529.10 | 3.00 | 915.00 | 1,258.00 | 1,744.00 | 2,359.00 |
| TAXI_OUT | 232,641.00 | 18.19 | 9.61 | 1.00 | 12.00 | 16.00 | 21.00 | 166.00 |
| CRS_ARR_TIME | 250,152.00 | 1,419.13 | 580.99 | 1.00 | 944.00 | 1,455.00 | 1,914.00 | 2,400.00 |
| ARR_DEL15 | 250,152.00 | 0.21 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| CRS_ELAPSED_TIME | 250,152.00 | 195.65 | 97.37 | 47.00 | 109.00 | 167.00 | 306.00 | 595.00 |
| DISTANCE | 250,152.00 | 1,258.84 | 898.12 | 94.00 | 427.00 | 1,028.00 | 2,248.00 | 4,983.00 |
# Atributos categóricos
jfk.describe(include=['object']).T
| count | unique | top | freq | |
|---|---|---|---|---|
| FL_DATE | 250152 | 730 | 2018-07-19 | 394 |
| TAIL_NUM | 235143 | 2645 | N110AN | 761 |
| ORIGIN | 250152 | 72 | LAX | 25345 |
| ORIGIN_CITY_NAME | 250152 | 70 | Los Angeles, CA | 25345 |
| ORIGIN_STATE_NM | 250152 | 32 | California | 51137 |
| DEST | 250152 | 1 | JFK | 250152 |
| DEST_CITY_NAME | 250152 | 1 | New York, NY | 250152 |
| DEST_STATE_NM | 250152 | 1 | New York | 250152 |
# Otra manera de mostrar las variables categóricas
jfk.select_dtypes(include=['object']).describe().T
| count | unique | top | freq | |
|---|---|---|---|---|
| FL_DATE | 250152 | 730 | 2018-07-19 | 394 |
| TAIL_NUM | 235143 | 2645 | N110AN | 761 |
| ORIGIN | 250152 | 72 | LAX | 25345 |
| ORIGIN_CITY_NAME | 250152 | 70 | Los Angeles, CA | 25345 |
| ORIGIN_STATE_NM | 250152 | 32 | California | 51137 |
| DEST | 250152 | 1 | JFK | 250152 |
| DEST_CITY_NAME | 250152 | 1 | New York, NY | 250152 |
| DEST_STATE_NM | 250152 | 1 | New York | 250152 |
# Otra manera más de mostrar las variables categóricas
jfk.describe(exclude='number').T # excluye integer y float
| count | unique | top | freq | |
|---|---|---|---|---|
| FL_DATE | 250152 | 730 | 2018-07-19 | 394 |
| TAIL_NUM | 235143 | 2645 | N110AN | 761 |
| ORIGIN | 250152 | 72 | LAX | 25345 |
| ORIGIN_CITY_NAME | 250152 | 70 | Los Angeles, CA | 25345 |
| ORIGIN_STATE_NM | 250152 | 32 | California | 51137 |
| DEST | 250152 | 1 | JFK | 250152 |
| DEST_CITY_NAME | 250152 | 1 | New York, NY | 250152 |
| DEST_STATE_NM | 250152 | 1 | New York | 250152 |
# Todas las variables juntas
jfk.describe(include = 'all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| FL_DATE | 250152 | 730 | 2018-07-19 | 394 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| OP_CARRIER_AIRLINE_ID | 237,644.00 | NaN | NaN | NaN | 20,152.00 | 297.88 | 19,690.00 | 19,790.00 | 20,363.00 | 20,409.00 | 21,171.00 |
| TAIL_NUM | 235143 | 2645 | N110AN | 761 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ORIGIN | 250152 | 72 | LAX | 25345 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ORIGIN_CITY_NAME | 250152 | 70 | Los Angeles, CA | 25345 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ORIGIN_STATE_NM | 250152 | 32 | California | 51137 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| DEST | 250152 | 1 | JFK | 250152 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| DEST_CITY_NAME | 250152 | 1 | New York, NY | 250152 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| DEST_STATE_NM | 250152 | 1 | New York | 250152 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| CRS_DEP_TIME | 250,152.00 | NaN | NaN | NaN | 1,334.61 | 529.10 | 3.00 | 915.00 | 1,258.00 | 1,744.00 | 2,359.00 |
| TAXI_OUT | 232,641.00 | NaN | NaN | NaN | 18.19 | 9.61 | 1.00 | 12.00 | 16.00 | 21.00 | 166.00 |
| CRS_ARR_TIME | 250,152.00 | NaN | NaN | NaN | 1,419.13 | 580.99 | 1.00 | 944.00 | 1,455.00 | 1,914.00 | 2,400.00 |
| ARR_DEL15 | 250,152.00 | NaN | NaN | NaN | 0.21 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| CRS_ELAPSED_TIME | 250,152.00 | NaN | NaN | NaN | 195.65 | 97.37 | 47.00 | 109.00 | 167.00 | 306.00 | 595.00 |
| DISTANCE | 250,152.00 | NaN | NaN | NaN | 1,258.84 | 898.12 | 94.00 | 427.00 | 1,028.00 | 2,248.00 | 4,983.00 |
Podemos usar la clase DataFrameReporter, que nos permite mostrar un descriptivo de todos los atributos juntos, así como los nulos que tienen.
report_jfk = DataFrameReporter(df = jfk).get_reports()
report_jfk
| Dtypes | Shape | Absolute_nulls | Relative_nulls | count | mean | std | min | 25% | 50% | 75% | max | count | unique | top | freq | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CRS_DEP_TIME | int64 | 250152 | 0 | 0.00 | 250,152.00 | 1,334.61 | 529.10 | 3.00 | 915.00 | 1,258.00 | 1,744.00 | 2,359.00 | ||||
| CRS_ARR_TIME | int64 | 250152 | 0 | 0.00 | 250,152.00 | 1,419.13 | 580.99 | 1.00 | 944.00 | 1,455.00 | 1,914.00 | 2,400.00 | ||||
| OP_CARRIER_AIRLINE_ID | float64 | 250152 | 12508 | 5.00 | 237,644.00 | 20,152.00 | 297.88 | 19,690.00 | 19,790.00 | 20,363.00 | 20,409.00 | 21,171.00 | ||||
| TAXI_OUT | float64 | 250152 | 17511 | 7.00 | 232,641.00 | 18.19 | 9.61 | 1.00 | 12.00 | 16.00 | 21.00 | 166.00 | ||||
| ARR_DEL15 | float64 | 250152 | 0 | 0.00 | 250,152.00 | 0.21 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ||||
| CRS_ELAPSED_TIME | float64 | 250152 | 0 | 0.00 | 250,152.00 | 195.65 | 97.37 | 47.00 | 109.00 | 167.00 | 306.00 | 595.00 | ||||
| DISTANCE | float64 | 250152 | 0 | 0.00 | 250,152.00 | 1,258.84 | 898.12 | 94.00 | 427.00 | 1,028.00 | 2,248.00 | 4,983.00 | ||||
| FL_DATE | object | 250152 | 0 | 0.00 | 250152 | 730 | 2018-07-19 | 394 | ||||||||
| TAIL_NUM | object | 250152 | 15009 | 6.00 | 235143 | 2645 | N110AN | 761 | ||||||||
| ORIGIN | object | 250152 | 0 | 0.00 | 250152 | 72 | LAX | 25345 | ||||||||
| ORIGIN_CITY_NAME | object | 250152 | 0 | 0.00 | 250152 | 70 | Los Angeles, CA | 25345 | ||||||||
| ORIGIN_STATE_NM | object | 250152 | 0 | 0.00 | 250152 | 32 | California | 51137 | ||||||||
| DEST | object | 250152 | 0 | 0.00 | 250152 | 1 | JFK | 250152 | ||||||||
| DEST_CITY_NAME | object | 250152 | 0 | 0.00 | 250152 | 1 | New York, NY | 250152 | ||||||||
| DEST_STATE_NM | object | 250152 | 0 | 0.00 | 250152 | 1 | New York | 250152 |
Variables de baja varianza
Son variables que van a ser irrelevantes para el modelo (no le van a aportar información relevante).
Se eliminan aquellas variables numéricas cuya varianza sea inferior a 0.15
varianza_nula (dataframe = jfk, std = 0.15)
Se han borrado las siguientes variables numéricas por tener baja varianza: []
| FL_DATE | OP_CARRIER_AIRLINE_ID | TAIL_NUM | ORIGIN | ORIGIN_CITY_NAME | ORIGIN_STATE_NM | DEST | DEST_CITY_NAME | DEST_STATE_NM | CRS_DEP_TIME | TAXI_OUT | CRS_ARR_TIME | ARR_DEL15 | CRS_ELAPSED_TIME | DISTANCE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2018-02-01 | 20,363.00 | N299PQ | BNA | Nashville, TN | Tennessee | JFK | New York, NY | New York | 1359 | 27.00 | 1719 | 0.00 | 140.00 | 765.00 |
| 1 | 2018-02-01 | 20,363.00 | N920XJ | RDU | Raleigh/Durham, NC | North Carolina | JFK | New York, NY | New York | 1354 | 37.00 | 1541 | 0.00 | 107.00 | 427.00 |
| 2 | 2018-02-01 | 20,363.00 | N605LR | BUF | Buffalo, NY | New York | JFK | New York, NY | New York | 1529 | 21.00 | 1709 | 0.00 | 100.00 | 301.00 |
| 3 | 2018-02-01 | 20,363.00 | N800AY | ORF | Norfolk, VA | Virginia | JFK | New York, NY | New York | 1704 | 36.00 | 1845 | 0.00 | 101.00 | 290.00 |
| 4 | 2018-02-01 | 20,363.00 | N600LR | BOS | Boston, MA | Massachusetts | JFK | New York, NY | New York | 1325 | 20.00 | 1454 | 0.00 | 89.00 | 187.00 |
En este dataset, encontramos tres variables categóricas sin varianza: DEST, DEST_CITY_NAME y DEST_STATE_NM, ya que todos los registros de vuelo tienen como destino el aeropuerto JFK de Nueva York (unique = 1). Es por ello que la información que le aportan estos atributos al modelo es irrelevante. Las eliminamos.
# Elimino variables categoricas sin varianza
jfk.drop(['DEST', 'DEST_CITY_NAME', 'DEST_STATE_NM'], axis = 1, inplace = True)
Distribución de los nulos
Trataremos por separado los atributos numéricos y categóricos para mantener una visión clara.
Seleccionamos las variables utilizando el método select_dtypes() y a continuación invocamos el método de Pandas isnull().
# Atributos numéricos
jfk.select_dtypes(include=['number']).isnull().sum()
OP_CARRIER_AIRLINE_ID 12508 CRS_DEP_TIME 0 TAXI_OUT 17511 CRS_ARR_TIME 0 ARR_DEL15 0 CRS_ELAPSED_TIME 0 DISTANCE 0 dtype: int64
jfk.select_dtypes(include=['object']).isnull().sum()
FL_DATE 0 TAIL_NUM 15009 ORIGIN 0 ORIGIN_CITY_NAME 0 ORIGIN_STATE_NM 0 dtype: int64
# Todos los atributos juntos
jfk.isnull().sum()
FL_DATE 0 OP_CARRIER_AIRLINE_ID 12508 TAIL_NUM 15009 ORIGIN 0 ORIGIN_CITY_NAME 0 ORIGIN_STATE_NM 0 CRS_DEP_TIME 0 TAXI_OUT 17511 CRS_ARR_TIME 0 ARR_DEL15 0 CRS_ELAPSED_TIME 0 DISTANCE 0 dtype: int64
También podemos usar la función nulos_variable()
nulos_variable (dataframe = jfk)
Las variables numéricas que tienen nulos son: OP_CARRIER_AIRLINE_ID 12508 TAXI_OUT 17511 dtype: int64 Hay un total de 2 variables numéricas con nulos Las variables categóricas que tienen nulos son: TAIL_NUM 15009 dtype: int64 Hay un total de 1 variables categóricas con nulos VARIABLES TOTALES CON NULOS: 3
Análisis rápido de los atributos
Análisis de las variables numéricas:
Análisis de las variables categóricas:
Análisis del target (ARR_DEL15):
# Lo exporto a csv para poder trabajar con el
report_jfk.to_csv('report/jfk_describe.csv')
E.- Distribución de los atributos
Análisis de FL_DATE
Vemos que se trata de un object. Dos formas de verlo:
jfk['FL_DATE'].dtypes
dtype('O')
jfk[['FL_DATE']].dtypes
FL_DATE object dtype: object
Lo convertimos a formato fecha, aplicando la función Datetime de Pandas
jfk['FL_DATE'] = pd.to_datetime(jfk['FL_DATE'])
Separamos las variables numéricas, booleanas y categóricas, así como el target
# Indico cual es la variable a predecir
target = 'ARR_DEL15'
# Guardo las variables en su lista correspondiente
list_num, list_bool, list_cat = obtener_lista_variables(dataset = jfk)
# Variables numéricas
list_num
['OP_CARRIER_AIRLINE_ID', 'CRS_DEP_TIME', 'TAXI_OUT', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME', 'DISTANCE']
# Variable booleanas
list_bool
[]
# Variable categóricas
list_cat
['TAIL_NUM', 'ORIGIN', 'ORIGIN_CITY_NAME', 'ORIGIN_STATE_NM']
Análisis del Target o Clase
En un problema de clasificación supervisada, es importante conocer la distribución del target.\ Se trata de un target categórico por lo que debemos saber cuantas categorías tiene y el número de registros que hay en cada una de ellas.
# Usamos el método value_counts() de Series Pandas
jfk[target].value_counts(dropna = False) # Con False muestra también los nulos
0.00 197520 1.00 52632 Name: ARR_DEL15, dtype: int64
Asimismo, podemos utilizar Seaborn para obtener una visualización sencilla.
# Visualización del target
sns.countplot(data = jfk, x = target);
Vemos que el target contiene 2 valores:
Nota: En caso de tener valores True/False, SI/NO, YES/NO, los convertimos a 1/0:
# Ejemplo de cómo convertirlos los string YES/NO en booleanos 1/0
#df['ARR_DEL15'] = (df['ARR_DEL15'] == 'Yes').astype(int)
A través de la clase DataFrameReporter habíamos visto una serie de descriptivos. Para el target:
Todos estos valores también se pueden obtener de forma independiente, aplicando unas funciones sobre la Serie del target:
# Número de registros de la Serie
jfk[target].count()
250152
# Valor mínimo de la Serie
jfk[target].min()
0.0
# Valor medio de la Serie
jfk[target].mean()
0.21040007675333397
Me está diciendo que el 21% de los vuelos se retrasan más de 15 minutos (ratio de prevalencia de la clase positiva). O lo que es lo mismo, el 79% de los vuelos no se retrasan más de 15 minutos.
# Valor máximo de la Serie
jfk[target].max()
1.0
# Número de vuelos que se retrasan más de 15 minutos
jfk[target].sum()
52632.0
Análisis de las variables numéricas
# instanciate the figure
fig = plt.figure(figsize = (20, 8))
for n, num in enumerate (list_num): # for loop aplicado a todas las variables numéricas
fig.add_subplot(2, 3, n+1) # Método para indicar el número de gráficos por filas y columnas
jfk[num].hist(bins=20) # Represento con un histograma cada atributo
plt.title(num) # Método para darle un título a la figura
# create a title for the figure
fig.suptitle ('Distribución de las variables numéricas', fontsize = 16);
Correlación entre variables numéricas
corr = jfk.corr()
corr.style.background_gradient(cmap="coolwarm") # parámetros de configuración de la tabla de correlación
| OP_CARRIER_AIRLINE_ID | CRS_DEP_TIME | TAXI_OUT | CRS_ARR_TIME | ARR_DEL15 | CRS_ELAPSED_TIME | DISTANCE | |
|---|---|---|---|---|---|---|---|
| OP_CARRIER_AIRLINE_ID | 1.000000 | -0.028113 | -0.029900 | -0.070828 | 0.043606 | -0.404431 | -0.388482 |
| CRS_DEP_TIME | -0.028113 | 1.000000 | 0.005575 | 0.111409 | 0.121370 | 0.159306 | 0.158418 |
| TAXI_OUT | -0.029900 | 0.005575 | 1.000000 | 0.088735 | 0.280887 | -0.020412 | -0.039050 |
| CRS_ARR_TIME | -0.070828 | 0.111409 | 0.088735 | 1.000000 | 0.118796 | -0.034389 | -0.064273 |
| ARR_DEL15 | 0.043606 | 0.121370 | 0.280887 | 0.118796 | 1.000000 | -0.004796 | -0.008876 |
| CRS_ELAPSED_TIME | -0.404431 | 0.159306 | -0.020412 | -0.034389 | -0.004796 | 1.000000 | 0.995510 |
| DISTANCE | -0.388482 | 0.158418 | -0.039050 | -0.064273 | -0.008876 | 0.995510 | 1.000000 |
Existe una correlación muy fuerte entre la distancia y la duración del vuelo. Eliminamos una de ellas.
# Elimino la variable
del jfk['DISTANCE']
Análisis de las variables categóricas
for i in list_cat: # for loop aplicado a todas las variables categóricas
print(jfk[i].value_counts(),'\n')
N110AN 761
N113AN 751
N111ZM 748
N115NN 740
N109NN 740
...
N197UW 1
N986AN 1
N315RJ 1
N801AW 1
N945DN 1
Name: TAIL_NUM, Length: 2645, dtype: int64
LAX 25345
SFO 15228
BOS 11201
MCO 9450
CLT 7769
FLL 7645
LAS 7377
MIA 7071
BUF 7049
ATL 6687
RDU 6665
SEA 6491
SJU 5712
ORD 5647
PHX 5460
TPA 5240
DCA 4930
SLC 4329
SAN 4317
AUS 4257
ROC 4181
JAX 4143
MSY 3707
PBI 3688
CLE 3681
BTV 3674
SYR 3479
CHS 3338
IAD 3267
DFW 3231
BWI 3179
PWM 3155
ORF 3102
BNA 3004
DTW 2911
DEN 2911
SAV 2847
MSP 2806
PIT 2686
CMH 2575
PDX 2528
RSW 2122
IND 2086
RIC 1951
SJC 1699
CVG 1395
LGB 1375
BUR 996
SAT 994
ACK 919
BQN 858
PHL 851
SRQ 764
HNL 754
SMF 684
PSE 653
HOU 648
ABQ 602
ORH 577
OAK 555
RNO 525
STT 478
ONT 465
BGR 442
PSP 405
MVY 405
DAB 365
IAH 192
HYA 171
EGE 161
SNA 68
JAC 29
Name: ORIGIN, dtype: int64
Los Angeles, CA 25345
San Francisco, CA 15228
Boston, MA 11201
Orlando, FL 9450
Washington, DC 8197
Charlotte, NC 7769
Fort Lauderdale, FL 7645
Las Vegas, NV 7377
Miami, FL 7071
Buffalo, NY 7049
Atlanta, GA 6687
Raleigh/Durham, NC 6665
Seattle, WA 6491
San Juan, PR 5712
Chicago, IL 5647
Phoenix, AZ 5460
Tampa, FL 5240
Salt Lake City, UT 4329
San Diego, CA 4317
Austin, TX 4257
Rochester, NY 4181
Jacksonville, FL 4143
New Orleans, LA 3707
West Palm Beach/Palm Beach, FL 3688
Cleveland, OH 3681
Burlington, VT 3674
Syracuse, NY 3479
Charleston, SC 3338
Dallas/Fort Worth, TX 3231
Baltimore, MD 3179
Portland, ME 3155
Norfolk, VA 3102
Nashville, TN 3004
Detroit, MI 2911
Denver, CO 2911
Savannah, GA 2847
Minneapolis, MN 2806
Pittsburgh, PA 2686
Columbus, OH 2575
Portland, OR 2528
Fort Myers, FL 2122
Indianapolis, IN 2086
Richmond, VA 1951
San Jose, CA 1699
Cincinnati, OH 1395
Long Beach, CA 1375
Burbank, CA 996
San Antonio, TX 994
Nantucket, MA 919
Aguadilla, PR 858
Philadelphia, PA 851
Houston, TX 840
Sarasota/Bradenton, FL 764
Honolulu, HI 754
Sacramento, CA 684
Ponce, PR 653
Albuquerque, NM 602
Worcester, MA 577
Oakland, CA 555
Reno, NV 525
Charlotte Amalie, VI 478
Ontario, CA 465
Bangor, ME 442
Martha's Vineyard, MA 405
Palm Springs, CA 405
Daytona Beach, FL 365
Hyannis, MA 171
Eagle, CO 161
Santa Ana, CA 68
Jackson, WY 29
Name: ORIGIN_CITY_NAME, dtype: int64
California 51137
Florida 40488
New York 14709
North Carolina 14434
Massachusetts 13273
Virginia 13250
Georgia 9534
Texas 9322
Nevada 7902
Puerto Rico 7223
Washington 6491
Ohio 6256
Illinois 5647
Arizona 5460
Utah 4329
Louisiana 3707
Vermont 3674
Maine 3597
Pennsylvania 3537
South Carolina 3338
Maryland 3179
Colorado 3072
Tennessee 3004
Michigan 2911
Minnesota 2806
Oregon 2528
Indiana 2086
Kentucky 1395
Hawaii 754
New Mexico 602
U.S. Virgin Islands 478
Wyoming 29
Name: ORIGIN_STATE_NM, dtype: int64
A.- FL_DATE
En el análisis anterior vimos que la mayoría de los vuelos proceden de Los Angeles. Veamos cuántos vuelos procedentes de Los Ángeles se retrasan cada día.
# Flights arriving from Los Angeles (label-based boolean selection)
lax_jfk = jfk.loc[jfk['ORIGIN'] == 'LAX']
lax_jfk.head()
| FL_DATE | OP_CARRIER_AIRLINE_ID | TAIL_NUM | ORIGIN | ORIGIN_CITY_NAME | ORIGIN_STATE_NM | CRS_DEP_TIME | TAXI_OUT | CRS_ARR_TIME | ARR_DEL15 | CRS_ELAPSED_TIME | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 34 | 2018-02-01 | 19,805.00 | N104NN | LAX | Los Angeles, CA | California | 1530 | 40.00 | 2359 | 1.00 | 329.00 |
| 35 | 2018-02-01 | 19,805.00 | N117AN | LAX | Los Angeles, CA | California | 1045 | 24.00 | 1919 | 0.00 | 334.00 |
| 36 | 2018-02-01 | 19,805.00 | N111ZM | LAX | Los Angeles, CA | California | 1630 | 36.00 | 59 | 0.00 | 329.00 |
| 39 | 2018-02-01 | 19,805.00 | N101NN | LAX | Los Angeles, CA | California | 1245 | 17.00 | 2123 | 0.00 | 338.00 |
| 100 | 2018-02-01 | 19,790.00 | NaN | LAX | Los Angeles, CA | California | 1130 | 27.00 | 2000 | 0.00 | 330.00 |
# Group by day (FL_DATE becomes the index): number of flights, number of delayed
# flights, and mean delay rate per day.
# NOTE: 'sum'/'mean' are passed as strings — passing np.mean (or other numpy
# callables) to .agg is deprecated in recent pandas; the resulting column
# labels ('len', 'sum', 'mean') are unchanged.
lax_jfk_dia = lax_jfk[['FL_DATE', target]].groupby('FL_DATE').agg([len, 'sum', 'mean'])
lax_jfk_dia.head()
| ARR_DEL15 | |||
|---|---|---|---|
| len | sum | mean | |
| FL_DATE | |||
| 2018-01-01 | 31 | 10.00 | 0.32 |
| 2018-01-02 | 35 | 13.00 | 0.37 |
| 2018-01-03 | 28 | 3.00 | 0.11 |
| 2018-01-04 | 2 | 1.00 | 0.50 |
| 2018-01-05 | 29 | 21.00 | 0.72 |
Por ejemplo, el 5 de enero del 2018, aterrizaron 29 vuelos procedentes de Los Ángeles, de los cuales 21 llegaron con retraso. Es decir, un 72% de los vuelos procedentes de Los Ángeles de ese día, llegaron con retraso.
Otra forma de colocar FL_DATE como índice:
lax_jfk.set_index('FL_DATE').head() # to persist the change you have to use inplace = True (or reassign)
| OP_CARRIER_AIRLINE_ID | TAIL_NUM | ORIGIN | ORIGIN_CITY_NAME | ORIGIN_STATE_NM | CRS_DEP_TIME | TAXI_OUT | CRS_ARR_TIME | ARR_DEL15 | CRS_ELAPSED_TIME | |
|---|---|---|---|---|---|---|---|---|---|---|
| FL_DATE | ||||||||||
| 2018-02-01 | 19,805.00 | N104NN | LAX | Los Angeles, CA | California | 1530 | 40.00 | 2359 | 1.00 | 329.00 |
| 2018-02-01 | 19,805.00 | N117AN | LAX | Los Angeles, CA | California | 1045 | 24.00 | 1919 | 0.00 | 334.00 |
| 2018-02-01 | 19,805.00 | N111ZM | LAX | Los Angeles, CA | California | 1630 | 36.00 | 59 | 0.00 | 329.00 |
| 2018-02-01 | 19,805.00 | N101NN | LAX | Los Angeles, CA | California | 1245 | 17.00 | 2123 | 0.00 | 338.00 |
| 2018-02-01 | 19,790.00 | NaN | LAX | Los Angeles, CA | California | 1130 | 27.00 | 2000 | 0.00 | 330.00 |
Visualizamos en un gráfico los retrasos. Para ello, la fecha tiene que estar en el índice
pd.DataFrame.plot?
Init signature: pd.DataFrame.plot(data) Docstring: Make plots of Series or DataFrame. Uses the backend specified by the option ``plotting.backend``. By default, matplotlib is used. Parameters ---------- data : Series or DataFrame The object for which the method is called. x : label or position, default None Only used if data is a DataFrame. y : label, position or list of label, positions, default None Allows plotting of one column versus another. Only used if data is a DataFrame. kind : str The kind of plot to produce: - 'line' : line plot (default) - 'bar' : vertical bar plot - 'barh' : horizontal bar plot - 'hist' : histogram - 'box' : boxplot - 'kde' : Kernel Density Estimation plot - 'density' : same as 'kde' - 'area' : area plot - 'pie' : pie plot - 'scatter' : scatter plot (DataFrame only) - 'hexbin' : hexbin plot (DataFrame only) ax : matplotlib axes object, default None An axes of the current figure. subplots : bool, default False Make separate subplots for each column. sharex : bool, default True if ax is None else False In case ``subplots=True``, share x axis and set some x axis labels to invisible; defaults to True if ax is None otherwise False if an ax is passed in; Be aware, that passing in both an ax and ``sharex=True`` will alter all x axis labels for all axis in a figure. sharey : bool, default False In case ``subplots=True``, share y axis and set some y axis labels to invisible. layout : tuple, optional (rows, columns) for the layout of subplots. figsize : a tuple (width, height) in inches Size of a figure object. use_index : bool, default True Use index as ticks for x axis. title : str or list Title to use for the plot. If a string is passed, print the string at the top of the figure. If a list is passed and `subplots` is True, print each item in the list above the corresponding subplot. grid : bool, default None (matlab style default) Axis grid lines. legend : bool or {'reverse'} Place legend on axis subplots. 
style : list or dict The matplotlib line style per column. logx : bool or 'sym', default False Use log scaling or symlog scaling on x axis. .. versionchanged:: 0.25.0 logy : bool or 'sym' default False Use log scaling or symlog scaling on y axis. .. versionchanged:: 0.25.0 loglog : bool or 'sym', default False Use log scaling or symlog scaling on both x and y axes. .. versionchanged:: 0.25.0 xticks : sequence Values to use for the xticks. yticks : sequence Values to use for the yticks. xlim : 2-tuple/list Set the x limits of the current axes. ylim : 2-tuple/list Set the y limits of the current axes. xlabel : label, optional Name to use for the xlabel on x-axis. Default uses index name as xlabel, or the x-column name for planar plots. .. versionadded:: 1.1.0 .. versionchanged:: 1.2.0 Now applicable to planar plots (`scatter`, `hexbin`). ylabel : label, optional Name to use for the ylabel on y-axis. Default will show no ylabel, or the y-column name for planar plots. .. versionadded:: 1.1.0 .. versionchanged:: 1.2.0 Now applicable to planar plots (`scatter`, `hexbin`). rot : int, default None Rotation for ticks (xticks for vertical, yticks for horizontal plots). fontsize : int, default None Font size for xticks and yticks. colormap : str or matplotlib colormap object, default None Colormap to select colors from. If string, load colormap with that name from matplotlib. colorbar : bool, optional If True, plot colorbar (only relevant for 'scatter' and 'hexbin' plots). position : float Specify relative alignments for bar plot layout. From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center). table : bool, Series or DataFrame, default False If True, draw a table using the data in the DataFrame and the data will be transposed to meet matplotlib's default layout. If a Series or DataFrame is passed, use passed data to draw a table. yerr : DataFrame, Series, array-like, dict and str See :ref:`Plotting with Error Bars <visualization.errorbars>` for detail. 
xerr : DataFrame, Series, array-like, dict and str Equivalent to yerr. stacked : bool, default False in line and bar plots, and True in area plot If True, create stacked plot. sort_columns : bool, default False Sort column names to determine plot ordering. secondary_y : bool or sequence, default False Whether to plot on the secondary y-axis if a list/tuple, which columns to plot on secondary y-axis. mark_right : bool, default True When using a secondary_y axis, automatically mark the column labels with "(right)" in the legend. include_bool : bool, default is False If True, boolean values can be plotted. backend : str, default None Backend to use instead of the backend specified in the option ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to specify the ``plotting.backend`` for the whole session, set ``pd.options.plotting.backend``. .. versionadded:: 1.0.0 **kwargs Options to pass to matplotlib plotting method. Returns ------- :class:`matplotlib.axes.Axes` or numpy.ndarray of them If the backend is not the default matplotlib one, the return value will be the object returned by the backend. Notes ----- - See matplotlib documentation online for more on this subject - If `kind` = 'bar' or 'barh', you can specify relative alignments for bar plot layout by `position` keyword. From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center) File: c:\users\jagui\anaconda3\lib\site-packages\pandas\plotting\_core.py Type: type Subclasses:
Retrasos diarios
# Daily mean of the target: select the (target, 'mean') column from the MultiIndex
lax_jfk_daily = lax_jfk_dia[(target, 'mean')]
lax_jfk_daily.head()
FL_DATE 2018-01-01 0.32 2018-01-02 0.37 2018-01-03 0.11 2018-01-04 0.50 2018-01-05 0.72 Name: (ARR_DEL15, mean), dtype: float64
# Plot the daily target mean (the index, FL_DATE, is used as the x-axis automatically)
fig, ax = plt.subplots(figsize=(15, 5))
lax_jfk_daily.plot(kind='line', ylabel=target, ax=ax)
fig.suptitle('Media diaria de retrasos de los vuelos procedentes de Los Angeles', fontsize=16);
Retrasos semanales
Si queremos la media semanal del target podemos utilizar la función resample().
# Resample by week (FL_DATE is already the index), taking the mean for each week
lax_jfk_weekly = lax_jfk_daily.resample('W').mean()
lax_jfk_weekly.head()
FL_DATE 2018-01-07 0.48 2018-01-14 0.24 2018-01-21 0.18 2018-01-28 0.09 2018-02-04 0.12 Freq: W-SUN, Name: (ARR_DEL15, mean), dtype: float64
# Plot the weekly target mean
fig = plt.figure(figsize = [15,5])
lax_jfk_weekly.plot(kind = 'line', ylabel = target)
fig.suptitle ('Media semanal de retrasos de los vuelos procedentes de Los Angeles', fontsize = 16 );
Retrasos mensuales
Si queremos la media mensual del target, utilizando la función resample():
# Resample by month, taking the mean of the daily series for each month
lax_jfk_monthly = lax_jfk_daily.resample('M').mean()
lax_jfk_monthly.head()
FL_DATE 2018-01-31 0.23 2018-02-28 0.10 2018-03-31 0.16 2018-04-30 0.16 2018-05-31 0.20 Freq: M, Name: (ARR_DEL15, mean), dtype: float64
# Plot the monthly target mean
fig = plt.figure(figsize = [15,5])
lax_jfk_monthly.plot(kind = 'line', ylabel = target)
fig.suptitle('Media mensual de retrasos de los vuelos procedentes de Los Angeles', fontsize = 16 );
Vemos en los gráficos que hay ciertos patrones que se repiten según la estación del año. Hay cierta estacionalidad (En Verano se producen los mayores retrasos y en invierno los menores retrasos).
# Refresh the per-type variable lists (numeric, boolean, categorical) after the changes above
list_num, list_bool, list_cat = obtener_lista_variables(dataset = jfk)
B.- VARIABLES NUMÉRICAS
list_num
['OP_CARRIER_AIRLINE_ID', 'CRS_DEP_TIME', 'TAXI_OUT', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME']
Usamos un pivot_table para analizar las variables con respecto al target y las representamos mediante un boxplot.
analisis_numericas??
Signature: analisis_numericas(dataframe, column, target) Source: def analisis_numericas (dataframe, column, target): ''' Pivot-table que me da las medidas de las variables numericas y su correlación con el target ''' print(f"\nANÁLISIS DE {column} CON RESPECTO AL {target}\n") return dataframe.pivot_table(index=column, values=target, aggfunc=[len, min, np.mean, max]).sort_values(by=(column),ascending=True) File: c:\users\jagui\appdata\local\temp\ipykernel_11692\2226796377.py Type: function
plot_numericas??
Signature: plot_numericas(dataframe, column, target) Source: def plot_numericas(dataframe,column,target): ''' Visualización de las variables numéricas y su correlación con el target ''' plt.figure(figsize=[8,3]) plt.title(f'Analisis de {column} VS {target}') sns.boxplot(x = target, y = column, data = dataframe, palette="coolwarm") return plt.show() File: c:\users\jagui\appdata\local\temp\ipykernel_11692\3268369109.py Type: function
# Analyse every numeric variable against the target: pivot-table summary plus boxplot
for col in list_num:
    print(analisis_numericas(jfk, col, target))
    plot_numericas(jfk, col, target)
ANÁLISIS DE OP_CARRIER_AIRLINE_ID CON RESPECTO AL ARR_DEL15
len min mean max
ARR_DEL15 ARR_DEL15 ARR_DEL15 ARR_DEL15
OP_CARRIER_AIRLINE_ID
19,690.00 695 0.00 0.29 1.00
19,790.00 58894 0.00 0.17 1.00
19,805.00 30694 0.00 0.20 1.00
19,930.00 8591 0.00 0.23 1.00
20,304.00 3993 0.00 0.25 1.00
20,363.00 40833 0.00 0.19 1.00
20,397.00 555 0.00 0.22 1.00
20,398.00 8814 0.00 0.23 1.00
20,409.00 78587 0.00 0.25 1.00
20,452.00 5044 0.00 0.17 1.00
21,171.00 944 0.00 0.15 1.00
ANÁLISIS DE CRS_DEP_TIME CON RESPECTO AL ARR_DEL15
len min mean max
ARR_DEL15 ARR_DEL15 ARR_DEL15 ARR_DEL15
CRS_DEP_TIME
3 1 0.00 0.00 0.00
4 53 0.00 0.25 1.00
5 1 0.00 0.00 0.00
9 1 0.00 0.00 0.00
10 1 0.00 0.00 0.00
... ... ... ... ...
2355 536 0.00 0.26 1.00
2356 181 0.00 0.29 1.00
2357 142 0.00 0.16 1.00
2358 280 0.00 0.17 1.00
2359 2953 0.00 0.24 1.00
[1200 rows x 4 columns]
ANÁLISIS DE TAXI_OUT CON RESPECTO AL ARR_DEL15
len min mean max
ARR_DEL15 ARR_DEL15 ARR_DEL15 ARR_DEL15
TAXI_OUT
1.00 1 0.00 0.00 0.00
2.00 5 0.00 0.80 1.00
3.00 11 0.00 0.18 1.00
4.00 12 0.00 0.08 1.00
5.00 42 0.00 0.19 1.00
... ... ... ... ...
156.00 1 1.00 1.00 1.00
159.00 1 1.00 1.00 1.00
161.00 1 1.00 1.00 1.00
162.00 2 1.00 1.00 1.00
166.00 1 1.00 1.00 1.00
[154 rows x 4 columns]
ANÁLISIS DE CRS_ARR_TIME CON RESPECTO AL ARR_DEL15
len min mean max
ARR_DEL15 ARR_DEL15 ARR_DEL15 ARR_DEL15
CRS_ARR_TIME
1 441 0.00 0.28 1.00
2 27 0.00 0.07 1.00
3 162 0.00 0.35 1.00
4 137 0.00 0.26 1.00
5 313 0.00 0.24 1.00
... ... ... ... ...
2356 120 0.00 0.31 1.00
2357 451 0.00 0.25 1.00
2358 305 0.00 0.27 1.00
2359 927 0.00 0.24 1.00
2400 13 0.00 0.31 1.00
[1260 rows x 4 columns]
ANÁLISIS DE CRS_ELAPSED_TIME CON RESPECTO AL ARR_DEL15
len min mean max
ARR_DEL15 ARR_DEL15 ARR_DEL15 ARR_DEL15
CRS_ELAPSED_TIME
47.00 39 0.00 0.67 1.00
50.00 22 0.00 0.18 1.00
52.00 30 0.00 0.43 1.00
54.00 113 0.00 0.44 1.00
55.00 50 0.00 0.42 1.00
... ... ... ... ...
581.00 11 0.00 0.64 1.00
585.00 154 0.00 0.41 1.00
586.00 2 1.00 1.00 1.00
590.00 102 0.00 0.21 1.00
595.00 24 0.00 0.17 1.00
[329 rows x 4 columns]
Vemos, por ejemplo en el TAXI_OUT, que la caja del 1.0 está por encima de la del 0.0. Eso significa que el TAXI_OUT es mayor en los vuelos que se retrasan. Por lo tanto el TAXI_OUT va a ser una variable muy importante para el target
Vemos con más detenimiento alguna de las variables, con otro tipo de gráficos.
# Scheduled ARRIVAL time of delayed flights (CRS_ARR_TIME is the scheduled arrival, not the departure)
jfk[(jfk[target]==1)]['CRS_ARR_TIME'].plot(kind='hist', bins = 20)
# Scheduled ARRIVAL time of on-time flights
jfk[(jfk[target]==0)]['CRS_ARR_TIME'].plot(kind='hist', bins = 20, alpha = 0.5);
En las últimas horas del día es donde se producen los mayores retrasos en la llegada de los vuelos (CRS_ARR_TIME es la hora programada de llegada).
# Scheduled DEPARTURE time of delayed flights (CRS_DEP_TIME is the scheduled departure, not the arrival)
jfk[(jfk[target]==1)]['CRS_DEP_TIME'].plot(kind='hist', bins = 20)
# Scheduled DEPARTURE time of on-time flights
jfk[(jfk[target]==0)]['CRS_DEP_TIME'].plot(kind='hist', bins = 20, alpha = 0.5);
En las horas centrales del día es donde se producen los mayores retrasos en la salida de los vuelos (CRS_DEP_TIME es la hora programada de salida).
Como alternativa, podemos utilizar los métodos de seaborn, que nos permiten además mostrar de una manera muy sencilla gráficos combinados con varias variables.
C.- VARIABLES CATEGÓRICAS
list_cat
['TAIL_NUM', 'ORIGIN', 'ORIGIN_CITY_NAME', 'ORIGIN_STATE_NM']
Realizamos el análisis utilizando un pivot_table y el countplot de seaborn (utilizando el parámetro hue para hacer un split de las categorías que tiene el target)
sns.countplot?
Signature: sns.countplot( data=None, *, x=None, y=None, hue=None, order=None, hue_order=None, orient=None, color=None, palette=None, saturation=0.75, width=0.8, dodge=True, ax=None, **kwargs, ) Docstring: Show the counts of observations in each categorical bin using bars. A count plot can be thought of as a histogram across a categorical, instead of quantitative, variable. The basic API and options are identical to those for :func:`barplot`, so you can compare counts across nested variables. Note that the newer :func:`histplot` function offers more functionality, although its default behavior is somewhat different. .. note:: This function always treats one of the variables as categorical and draws data at ordinal positions (0, 1, ... n) on the relevant axis, even when the data has a numeric or date type. See the :ref:`tutorial <categorical_tutorial>` for more information. Parameters ---------- data : DataFrame, array, or list of arrays, optional Dataset for plotting. If ``x`` and ``y`` are absent, this is interpreted as wide-form. Otherwise it is expected to be long-form. x, y, hue : names of variables in ``data`` or vector data, optional Inputs for plotting long-form data. See examples for interpretation. order, hue_order : lists of strings, optional Order to plot the categorical levels in; otherwise the levels are inferred from the data objects. orient : "v" | "h", optional Orientation of the plot (vertical or horizontal). This is usually inferred based on the type of the input variables, but it can be used to resolve ambiguity when both `x` and `y` are numeric or when plotting wide-form data. color : matplotlib color, optional Single color for the elements in the plot. palette : palette name, list, or dict Colors to use for the different levels of the ``hue`` variable. Should be something that can be interpreted by :func:`color_palette`, or a dictionary mapping hue levels to matplotlib colors. 
saturation : float, optional Proportion of the original saturation to draw colors at. Large patches often look better with slightly desaturated colors, but set this to `1` if you want the plot colors to perfectly match the input color. dodge : bool, optional When hue nesting is used, whether elements should be shifted along the categorical axis. ax : matplotlib Axes, optional Axes object to draw the plot onto, otherwise uses the current Axes. kwargs : key, value mappings Other keyword arguments are passed through to :meth:`matplotlib.axes.Axes.bar`. Returns ------- ax : matplotlib Axes Returns the Axes object with the plot drawn onto it. See Also -------- barplot : Show point estimates and confidence intervals using bars. catplot : Combine a categorical plot with a :class:`FacetGrid`. Examples -------- .. include:: ../docstrings/countplot.rst File: c:\users\jagui\anaconda3\lib\site-packages\seaborn\categorical.py Type: function
analisis_categoricas??
Signature: analisis_categoricas(dataframe, column, target) Source: def analisis_categoricas (dataframe,column,target): ''' Pivot-table que me da las medidas de las variables categóricas y su correlación con el target ''' print(f"\nANÁLISIS DE {column} CON RESPECTO AL {target}\n") return dataframe.pivot_table(index=column, values=target, aggfunc=[len,sum,np.mean]).sort_values(by=('len', target),ascending=False) File: c:\users\jagui\appdata\local\temp\ipykernel_11692\4246548799.py Type: function
plot_categoricas??
Signature: plot_categoricas(dataframe, column, target) Source: def plot_categoricas(dataframe,column,target): ''' Visualización de las variables categóricas y su correlación con el target ''' if column == 'TAIL_NUM': pass else: plt.figure(figsize=[20,5]) plt.title(f'Analisis de {column} VS {target}') plt.xticks(rotation = 90) sns.countplot(data = dataframe, x = column, hue = target ) return plt.show() File: c:\users\jagui\appdata\local\temp\ipykernel_11692\3984434524.py Type: function
# Analyse every categorical variable against the target: pivot-table summary plus countplot
for col in list_cat:
    print(analisis_categoricas(jfk, col, target))
    plot_categoricas(jfk, col, target)
ANÁLISIS DE TAIL_NUM CON RESPECTO AL ARR_DEL15
len sum mean
ARR_DEL15 ARR_DEL15 ARR_DEL15
TAIL_NUM
N110AN 761 164.00 0.22
N113AN 751 134.00 0.18
N111ZM 748 162.00 0.22
N115NN 740 121.00 0.16
N109NN 740 118.00 0.16
... ... ... ...
N921DN 1 0.00 0.00
N703DN 1 0.00 0.00
N922DX 1 0.00 0.00
N924DN 1 0.00 0.00
N928AM 1 0.00 0.00
[2645 rows x 3 columns]
ANÁLISIS DE ORIGIN CON RESPECTO AL ARR_DEL15
len sum mean
ARR_DEL15 ARR_DEL15 ARR_DEL15
ORIGIN
LAX 25345 4,803.00 0.19
SFO 15228 3,448.00 0.23
BOS 11201 2,283.00 0.20
MCO 9450 2,400.00 0.25
CLT 7769 1,764.00 0.23
FLL 7645 2,019.00 0.26
LAS 7377 1,258.00 0.17
MIA 7071 1,803.00 0.25
BUF 7049 1,300.00 0.18
ATL 6687 1,449.00 0.22
RDU 6665 1,646.00 0.25
SEA 6491 1,375.00 0.21
SJU 5712 1,053.00 0.18
ORD 5647 1,705.00 0.30
PHX 5460 1,021.00 0.19
TPA 5240 1,096.00 0.21
DCA 4930 898.00 0.18
SLC 4329 844.00 0.19
SAN 4317 767.00 0.18
AUS 4257 795.00 0.19
ROC 4181 740.00 0.18
JAX 4143 783.00 0.19
MSY 3707 772.00 0.21
PBI 3688 880.00 0.24
CLE 3681 710.00 0.19
BTV 3674 757.00 0.21
SYR 3479 627.00 0.18
CHS 3338 668.00 0.20
IAD 3267 721.00 0.22
DFW 3231 873.00 0.27
BWI 3179 612.00 0.19
PWM 3155 691.00 0.22
ORF 3102 691.00 0.22
BNA 3004 545.00 0.18
DEN 2911 534.00 0.18
DTW 2911 562.00 0.19
SAV 2847 610.00 0.21
MSP 2806 471.00 0.17
PIT 2686 486.00 0.18
CMH 2575 486.00 0.19
PDX 2528 405.00 0.16
RSW 2122 528.00 0.25
IND 2086 370.00 0.18
RIC 1951 360.00 0.18
SJC 1699 403.00 0.24
CVG 1395 299.00 0.21
LGB 1375 345.00 0.25
BUR 996 260.00 0.26
SAT 994 84.00 0.08
ACK 919 263.00 0.29
BQN 858 200.00 0.23
PHL 851 233.00 0.27
SRQ 764 160.00 0.21
HNL 754 222.00 0.29
SMF 684 101.00 0.15
PSE 653 142.00 0.22
HOU 648 189.00 0.29
ABQ 602 148.00 0.25
ORH 577 96.00 0.17
OAK 555 120.00 0.22
RNO 525 102.00 0.19
STT 478 65.00 0.14
ONT 465 56.00 0.12
BGR 442 67.00 0.15
PSP 405 93.00 0.23
MVY 405 110.00 0.27
DAB 365 78.00 0.21
IAH 192 71.00 0.37
HYA 171 47.00 0.27
EGE 161 46.00 0.29
SNA 68 16.00 0.24
JAC 29 7.00 0.24
ANÁLISIS DE ORIGIN_CITY_NAME CON RESPECTO AL ARR_DEL15
len sum mean
ARR_DEL15 ARR_DEL15 ARR_DEL15
ORIGIN_CITY_NAME
Los Angeles, CA 25345 4,803.00 0.19
San Francisco, CA 15228 3,448.00 0.23
Boston, MA 11201 2,283.00 0.20
Orlando, FL 9450 2,400.00 0.25
Washington, DC 8197 1,619.00 0.20
Charlotte, NC 7769 1,764.00 0.23
Fort Lauderdale, FL 7645 2,019.00 0.26
Las Vegas, NV 7377 1,258.00 0.17
Miami, FL 7071 1,803.00 0.25
Buffalo, NY 7049 1,300.00 0.18
Atlanta, GA 6687 1,449.00 0.22
Raleigh/Durham, NC 6665 1,646.00 0.25
Seattle, WA 6491 1,375.00 0.21
San Juan, PR 5712 1,053.00 0.18
Chicago, IL 5647 1,705.00 0.30
Phoenix, AZ 5460 1,021.00 0.19
Tampa, FL 5240 1,096.00 0.21
Salt Lake City, UT 4329 844.00 0.19
San Diego, CA 4317 767.00 0.18
Austin, TX 4257 795.00 0.19
Rochester, NY 4181 740.00 0.18
Jacksonville, FL 4143 783.00 0.19
New Orleans, LA 3707 772.00 0.21
West Palm Beach/Palm Beach, FL 3688 880.00 0.24
Cleveland, OH 3681 710.00 0.19
Burlington, VT 3674 757.00 0.21
Syracuse, NY 3479 627.00 0.18
Charleston, SC 3338 668.00 0.20
Dallas/Fort Worth, TX 3231 873.00 0.27
Baltimore, MD 3179 612.00 0.19
Portland, ME 3155 691.00 0.22
Norfolk, VA 3102 691.00 0.22
Nashville, TN 3004 545.00 0.18
Detroit, MI 2911 562.00 0.19
Denver, CO 2911 534.00 0.18
Savannah, GA 2847 610.00 0.21
Minneapolis, MN 2806 471.00 0.17
Pittsburgh, PA 2686 486.00 0.18
Columbus, OH 2575 486.00 0.19
Portland, OR 2528 405.00 0.16
Fort Myers, FL 2122 528.00 0.25
Indianapolis, IN 2086 370.00 0.18
Richmond, VA 1951 360.00 0.18
San Jose, CA 1699 403.00 0.24
Cincinnati, OH 1395 299.00 0.21
Long Beach, CA 1375 345.00 0.25
Burbank, CA 996 260.00 0.26
San Antonio, TX 994 84.00 0.08
Nantucket, MA 919 263.00 0.29
Aguadilla, PR 858 200.00 0.23
Philadelphia, PA 851 233.00 0.27
Houston, TX 840 260.00 0.31
Sarasota/Bradenton, FL 764 160.00 0.21
Honolulu, HI 754 222.00 0.29
Sacramento, CA 684 101.00 0.15
Ponce, PR 653 142.00 0.22
Albuquerque, NM 602 148.00 0.25
Worcester, MA 577 96.00 0.17
Oakland, CA 555 120.00 0.22
Reno, NV 525 102.00 0.19
Charlotte Amalie, VI 478 65.00 0.14
Ontario, CA 465 56.00 0.12
Bangor, ME 442 67.00 0.15
Martha's Vineyard, MA 405 110.00 0.27
Palm Springs, CA 405 93.00 0.23
Daytona Beach, FL 365 78.00 0.21
Hyannis, MA 171 47.00 0.27
Eagle, CO 161 46.00 0.29
Santa Ana, CA 68 16.00 0.24
Jackson, WY 29 7.00 0.24
ANÁLISIS DE ORIGIN_STATE_NM CON RESPECTO AL ARR_DEL15
len sum mean
ARR_DEL15 ARR_DEL15 ARR_DEL15
ORIGIN_STATE_NM
California 51137 10,412.00 0.20
Florida 40488 9,747.00 0.24
New York 14709 2,667.00 0.18
North Carolina 14434 3,410.00 0.24
Massachusetts 13273 2,799.00 0.21
Virginia 13250 2,670.00 0.20
Georgia 9534 2,059.00 0.22
Texas 9322 2,012.00 0.22
Nevada 7902 1,360.00 0.17
Puerto Rico 7223 1,395.00 0.19
Washington 6491 1,375.00 0.21
Ohio 6256 1,196.00 0.19
Illinois 5647 1,705.00 0.30
Arizona 5460 1,021.00 0.19
Utah 4329 844.00 0.19
Louisiana 3707 772.00 0.21
Vermont 3674 757.00 0.21
Maine 3597 758.00 0.21
Pennsylvania 3537 719.00 0.20
South Carolina 3338 668.00 0.20
Maryland 3179 612.00 0.19
Colorado 3072 580.00 0.19
Tennessee 3004 545.00 0.18
Michigan 2911 562.00 0.19
Minnesota 2806 471.00 0.17
Oregon 2528 405.00 0.16
Indiana 2086 370.00 0.18
Kentucky 1395 299.00 0.21
Hawaii 754 222.00 0.29
New Mexico 602 148.00 0.25
U.S. Virgin Islands 478 65.00 0.14
Wyoming 29 7.00 0.24
Vemos varias cosas importantes para el target:
A.- VARIABLES CATEGÓRICAS
# List the null counts of the categorical (object-dtype) columns
jfk.select_dtypes(include = ['object']).isnull().sum()
TAIL_NUM 15009 ORIGIN 0 ORIGIN_CITY_NAME 0 ORIGIN_STATE_NM 0 dtype: int64
OP_CARRIER_AIRLINE_ID
jfk['OP_CARRIER_AIRLINE_ID'].dtype
dtype('float64')
Se trata de un número que no tiene ningún significado, más allá que el de identificar de forma única a la compañía aerea. Por lo tanto, convertimos esta variable en una categórica.
# The carrier ID carries no numeric meaning, so treat it as categorical (object dtype)
jfk['OP_CARRIER_AIRLINE_ID'] = jfk['OP_CARRIER_AIRLINE_ID'].astype('object')
jfk['OP_CARRIER_AIRLINE_ID'].dtype
dtype('O')
# Cantidad de compañías aereas
jfk['OP_CARRIER_AIRLINE_ID'].nunique()
11
jfk['OP_CARRIER_AIRLINE_ID'].value_counts()
20,409.00 78587 19,790.00 58894 20,363.00 40833 19,805.00 30694 20,398.00 8814 19,930.00 8591 20,452.00 5044 20,304.00 3993 21,171.00 944 19,690.00 695 20,397.00 555 Name: OP_CARRIER_AIRLINE_ID, dtype: int64
# Tail numbers of the aircraft operated by carrier 20409 and the number of flights of each
jfk[jfk['OP_CARRIER_AIRLINE_ID']==20409]['TAIL_NUM'].value_counts()
N959JB 569
N957JB 557
N970JB 552
N948JB 541
N907JB 535
...
N527JL 18
N2029J 17
N2038J 10
N977JB 8
N2017J 4
Name: TAIL_NUM, Length: 303, dtype: int64
El avión que más vuelos realiza es el N959JB
# Rows where the carrier ID is missing
jfk[jfk['OP_CARRIER_AIRLINE_ID'].isnull()]
| FL_DATE | OP_CARRIER_AIRLINE_ID | TAIL_NUM | ORIGIN | ORIGIN_CITY_NAME | ORIGIN_STATE_NM | CRS_DEP_TIME | TAXI_OUT | CRS_ARR_TIME | ARR_DEL15 | CRS_ELAPSED_TIME | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 41 | 2018-02-01 | NaN | N845NN | MCO | Orlando, FL | Florida | 1320 | NaN | 1603 | 0.00 | 163.00 |
| 44 | 2018-02-01 | NaN | N897NN | CLT | Charlotte, NC | North Carolina | 1603 | 13.00 | 1804 | 0.00 | 121.00 |
| 87 | 2018-02-01 | NaN | N314NB | MSY | New Orleans, LA | Louisiana | 1140 | 10.00 | 1540 | 0.00 | 180.00 |
| 89 | 2018-02-01 | NaN | N187DN | SFO | San Francisco, CA | California | 830 | 27.00 | 1710 | 0.00 | 340.00 |
| 117 | 2018-02-01 | NaN | NaN | SAN | San Diego, CA | California | 2247 | 12.00 | 705 | 0.00 | 318.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 250056 | 2018-10-31 | NaN | N996JB | MCO | Orlando, FL | Florida | 1757 | 18.00 | 2032 | 0.00 | 155.00 |
| 250058 | 2018-10-31 | NaN | N905JB | MCO | Orlando, FL | Florida | 1958 | 16.00 | 2228 | 0.00 | 150.00 |
| 250069 | 2018-10-31 | NaN | NaN | LAX | Los Angeles, CA | California | 2114 | 13.00 | 532 | 0.00 | 318.00 |
| 250094 | 2018-10-31 | NaN | N280SY | ORD | Chicago, IL | Illinois | 1200 | 12.00 | 1526 | 0.00 | 146.00 |
| 250126 | 2018-10-31 | NaN | N806AE | CLE | Cleveland, OH | Ohio | 1337 | 10.00 | 1519 | 0.00 | 102.00 |
12508 rows × 11 columns
Para imputar los nulos nos ayudamos de fuentes externas:
Por ejemplo, el avión con TAIL_NUM N115NN, es un Airbus A321, con número de serie 6063, que pertenece a la compañía aerea American Airlines y cuyo código identificativo es AA (IATA code) y AAL (ICAO code). En cambio el dataset nos da un código numérico, por lo que se deduce que estos códigos son inventados.
# Distinct aircraft tail numbers
jfk['TAIL_NUM'].unique()
array(['N299PQ', 'N920XJ', 'N605LR', ..., 'N226JS', 'N251PS', 'N945DN'],
dtype=object)
Por otro lado, si fuesen reales, el tratar de imputar los nulos de esta variable buscando con cada uno de los códigos de los aviones a qué compañía pertenecen es inviable, dado que tenemos 2645 aviones diferentes en el dataset.
Al rellenar nulos de una variable categórica, hay que tener en cuenta que las variables categóricas se convierten con One Hot Encoder normalmente. Si creemos que los nulos pueden contener información relevante, los imputamos con un valor único. Si no queremos mantener esta información, no es necesario realizar ningún tratamiento de nulos, ya que al hacer el OHE simplemente no se creará una columna específica para ellos.
Por lo tanto tenemos varias posibilidades:
En este caso, como vamos a utilizar modelos con DecisionTree, imputamos los nulos con un valor extremo y único 'ZZZ'. Los outliers no afectan a los DecisionTree, en cambio sí afectan, y mucho, a las Regresiones.
pd.DataFrame.fillna?
Signature: pd.DataFrame.fillna( self, value: 'object | ArrayLike | None' = None, method: 'FillnaOptions | None' = None, axis: 'Axis | None' = None, inplace: 'bool' = False, limit=None, downcast=None, ) -> 'DataFrame | None' Docstring: Fill NA/NaN values using the specified method. Parameters ---------- value : scalar, dict, Series, or DataFrame Value to use to fill holes (e.g. 0), alternately a dict/Series/DataFrame of values specifying which value to use for each index (for a Series) or column (for a DataFrame). Values not in the dict/Series/DataFrame will not be filled. This value cannot be a list. method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None Method to use for filling holes in reindexed Series pad / ffill: propagate last valid observation forward to next valid backfill / bfill: use next valid observation to fill gap. axis : {0 or 'index', 1 or 'columns'} Axis along which to fill missing values. inplace : bool, default False If True, fill in-place. Note: this will modify any other views on this object (e.g., a no-copy slice for a column in a DataFrame). limit : int, default None If method is specified, this is the maximum number of consecutive NaN values to forward/backward fill. In other words, if there is a gap with more than this number of consecutive NaNs, it will only be partially filled. If method is not specified, this is the maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. downcast : dict, default is None A dict of item->dtype of what to downcast if possible, or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). Returns ------- DataFrame or None Object with missing values filled or None if ``inplace=True``. See Also -------- interpolate : Fill NaN values using interpolation. reindex : Conform object to new index. asfreq : Convert TimeSeries to specified frequency. 
Examples -------- >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], ... [np.nan, np.nan, np.nan, np.nan], ... [np.nan, 3, np.nan, 4]], ... columns=list("ABCD")) >>> df A B C D 0 NaN 2.0 NaN 0.0 1 3.0 4.0 NaN 1.0 2 NaN NaN NaN NaN 3 NaN 3.0 NaN 4.0 Replace all NaN elements with 0s. >>> df.fillna(0) A B C D 0 0.0 2.0 0.0 0.0 1 3.0 4.0 0.0 1.0 2 0.0 0.0 0.0 0.0 3 0.0 3.0 0.0 4.0 We can also propagate non-null values forward or backward. >>> df.fillna(method="ffill") A B C D 0 NaN 2.0 NaN 0.0 1 3.0 4.0 NaN 1.0 2 3.0 4.0 NaN 1.0 3 3.0 3.0 NaN 4.0 Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1, 2, and 3 respectively. >>> values = {"A": 0, "B": 1, "C": 2, "D": 3} >>> df.fillna(value=values) A B C D 0 0.0 2.0 2.0 0.0 1 3.0 4.0 2.0 1.0 2 0.0 1.0 2.0 3.0 3 0.0 3.0 2.0 4.0 Only replace the first NaN element. >>> df.fillna(value=values, limit=1) A B C D 0 0.0 2.0 2.0 0.0 1 3.0 4.0 NaN 1.0 2 NaN 1.0 NaN 3.0 3 NaN 3.0 NaN 4.0 When filling using a DataFrame, replacement happens along the same column names and same indices >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE")) >>> df.fillna(df2) A B C D 0 0.0 2.0 0.0 0.0 1 3.0 4.0 0.0 1.0 2 0.0 0.0 0.0 NaN 3 0.0 3.0 0.0 4.0 Note that column D is not affected since it is not present in df2. File: c:\users\jagui\anaconda3\lib\site-packages\pandas\core\frame.py Type: function
# Impute missing carrier IDs with a unique sentinel category ('ZZZ').
# Assign the result instead of calling fillna(inplace=True) on a column
# selection: chained-inplace fillna is deprecated (pandas 2.x) and becomes
# a silent no-op under copy-on-write (pandas 3.0).
jfk['OP_CARRIER_AIRLINE_ID'] = jfk['OP_CARRIER_AIRLINE_ID'].fillna('ZZZ')
jfk['OP_CARRIER_AIRLINE_ID'].value_counts()
20409.0 78587 19790.0 58894 20363.0 40833 19805.0 30694 ZZZ 12508 20398.0 8814 19930.0 8591 20452.0 5044 20304.0 3993 21171.0 944 19690.0 695 20397.0 555 Name: OP_CARRIER_AIRLINE_ID, dtype: int64
TAIL_NUM
jfk['TAIL_NUM'].nunique()
2645
jfk['TAIL_NUM'].value_counts(dropna = False)
NaN 15009
N110AN 761
N113AN 751
N111ZM 748
N115NN 740
...
N197UW 1
N986AN 1
N315RJ 1
N801AW 1
N945DN 1
Name: TAIL_NUM, Length: 2646, dtype: int64
Tenemos 2645 aviones, cuyo número identificativo está formado por tres partes:
Imputamos los nulos con un valor extremo y único 'ZZZ'
# Impute missing tail numbers with the sentinel 'ZZZ'. Column assignment
# avoids the deprecated chained-inplace fillna pattern (no-op in pandas 3.0
# under copy-on-write).
jfk['TAIL_NUM'] = jfk['TAIL_NUM'].fillna('ZZZ')
jfk['TAIL_NUM'].value_counts()
ZZZ 15009
N110AN 761
N113AN 751
N111ZM 748
N115NN 740
...
N197UW 1
N986AN 1
N315RJ 1
N801AW 1
N945DN 1
Name: TAIL_NUM, Length: 2646, dtype: int64
B.- VARIABLES NUMÉRICAS
# Listamos los nulos de las variables numéricas
jfk.select_dtypes(include = ['number']).isnull().sum()
CRS_DEP_TIME 0 TAXI_OUT 17511 CRS_ARR_TIME 0 ARR_DEL15 0 CRS_ELAPSED_TIME 0 dtype: int64
TAXI_OUT
jfk['TAXI_OUT'].value_counts(dropna = False)
13.00 18840
12.00 18344
14.00 18119
NaN 17511
15.00 16614
...
146.00 1
161.00 1
159.00 1
140.00 1
156.00 1
Name: TAXI_OUT, Length: 155, dtype: int64
En las variables numéricas, podemos imputar con cualquier valor:
O bien podemos eliminar los registros que los contienen.
En nuestro caso, imputaremos con un valor extremo, ya que al trabajar con Decision Trees no le va a afectar.
Otra cosa es si trabajamos con regresiones lineales, ya que funcionan mal con valores extremos.
# Impute missing TAXI_OUT with an extreme sentinel (-999); harmless for
# tree-based models, which will just split it off. Column assignment avoids
# the deprecated chained-inplace fillna pattern.
jfk['TAXI_OUT'] = jfk['TAXI_OUT'].fillna(-999)
jfk['TAXI_OUT'].value_counts()
13.00 18840
12.00 18344
14.00 18119
-999.00 17511
15.00 16614
...
146.00 1
161.00 1
159.00 1
140.00 1
156.00 1
Name: TAXI_OUT, Length: 155, dtype: int64
# Comprobamos que todos los nulos están imputados
jfk.isnull().sum()
FL_DATE 0 OP_CARRIER_AIRLINE_ID 0 TAIL_NUM 0 ORIGIN 0 ORIGIN_CITY_NAME 0 ORIGIN_STATE_NM 0 CRS_DEP_TIME 0 TAXI_OUT 0 CRS_ARR_TIME 0 ARR_DEL15 0 CRS_ELAPSED_TIME 0 dtype: int64
Extraemos información relevante de las fechas (día, mes, año, ...) y la guardamos en nuevas variables que van a ser más fáciles de trabajar con ellas en el modelo.
# Derive calendar features from the flight date.
jfk['SEASON'] = jfk['FL_DATE'].dt.quarter
jfk['YEAR'] = jfk['FL_DATE'].dt.year
jfk['MONTH'] = jfk['FL_DATE'].dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0; use the
# ISO calendar week and cast back to a plain int dtype (isocalendar()
# returns a nullable UInt32).
jfk['WEEK'] = jfk['FL_DATE'].dt.isocalendar().week.astype(int)
jfk['WEEKDAY'] = jfk['FL_DATE'].dt.weekday  # Monday=0 ... Sunday=6
jfk.sample(1).T
| 4155 | |
|---|---|
| FL_DATE | 2018-02-14 00:00:00 |
| OP_CARRIER_AIRLINE_ID | 20,409.00 |
| TAIL_NUM | N961JB |
| ORIGIN | LAX |
| ORIGIN_CITY_NAME | Los Angeles, CA |
| ORIGIN_STATE_NM | California |
| CRS_DEP_TIME | 1415 |
| TAXI_OUT | 17.00 |
| CRS_ARR_TIME | 2229 |
| ARR_DEL15 | 0.00 |
| CRS_ELAPSED_TIME | 314.00 |
| SEASON | 1 |
| YEAR | 2018 |
| MONTH | 2 |
| WEEK | 7 |
| WEEKDAY | 2 |
Es importante revisar el orden con el que genera el día de la semana ya que la semana puede empezar en lunes o en domingo.
Cogemos un registro cualquiera del dataset para checkear la fecha. El 6 de Octubre de 2019 fue Domingo, que equivale al weeekday igual a 6. Eso significa que la semana empieza con 0 (el lunes es un 0 en el weekday):
jfk['WEEKDAY'].value_counts(dropna=False)
0 37183 4 36848 3 36679 6 36288 1 35945 2 35443 5 31766 Name: WEEKDAY, dtype: int64
Si detectamos que hay mayores retrasos en fin de semana, podemos generar una variable WEEKEND. De esta forma le estamos mostrando al modelo qué días son fin de semana.
Los Decision Trees toman cada variable y las divide en dos grupos a partir de un valor en concreto (1, 2, ...), de forma que un grupo se parezca lo más posible al target (grupo puro) y el otro no (grupo impuro).
En nuestro caso, al estar codificado el lunes como 0 y domingo como 6, al modelo le da igual si hacemos la transformación WEEKEND (WEEKDAY>=5) porque tanto si la hacemos como si no, necesita un único corte en ambos casos.
En cambio si la semana empezase en domingo (codificado como 0) y terminase en sábado (codificado como 6), sí que sería útil hacer la transformación WEEKEND, ya que en este caso el modelo necesitaría dos cortes para discriminar los fines de semana.
# Boolean flag: 1 for weekend flights (Saturday=5, Sunday=6), 0 otherwise.
is_weekend = jfk['WEEKDAY'] >= 5
jfk['WEEKEND'] = is_weekend.astype(int)
jfk['WEEKEND'].value_counts()
0 182098 1 68054 Name: WEEKEND, dtype: int64
jfk['SEASON'].value_counts()
3 64557 2 64008 4 62591 1 58996 Name: SEASON, dtype: int64
jfk['YEAR'].value_counts()
2018 125483 2019 124669 Name: YEAR, dtype: int64
jfk['MONTH'].value_counts()
8 21984 7 21862 5 21712 6 21595 10 21378 12 21082 3 21011 9 20711 4 20701 11 20131 1 19489 2 18496 Name: MONTH, dtype: int64
Como ya hemos extraido toda la información de FL_DATE, podemos eliminarla.
# FL_DATE is fully decomposed into calendar features, so remove it.
jfk = jfk.drop(columns='FL_DATE')
jfk.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| OP_CARRIER_AIRLINE_ID | 20,363.00 | 20,363.00 | 20,363.00 | 20,363.00 | 20,363.00 |
| TAIL_NUM | N299PQ | N920XJ | N605LR | N800AY | N600LR |
| ORIGIN | BNA | RDU | BUF | ORF | BOS |
| ORIGIN_CITY_NAME | Nashville, TN | Raleigh/Durham, NC | Buffalo, NY | Norfolk, VA | Boston, MA |
| ORIGIN_STATE_NM | Tennessee | North Carolina | New York | Virginia | Massachusetts |
| CRS_DEP_TIME | 1359 | 1354 | 1529 | 1704 | 1325 |
| TAXI_OUT | 27.00 | 37.00 | 21.00 | 36.00 | 20.00 |
| CRS_ARR_TIME | 1719 | 1541 | 1709 | 1845 | 1454 |
| ARR_DEL15 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| CRS_ELAPSED_TIME | 140.00 | 107.00 | 100.00 | 101.00 | 89.00 |
| SEASON | 1 | 1 | 1 | 1 | 1 |
| YEAR | 2018 | 2018 | 2018 | 2018 | 2018 |
| MONTH | 2 | 2 | 2 | 2 | 2 |
| WEEK | 5 | 5 | 5 | 5 | 5 |
| WEEKDAY | 3 | 3 | 3 | 3 | 3 |
| WEEKEND | 0 | 0 | 0 | 0 | 0 |
jfk.info(verbose = True)
<class 'pandas.core.frame.DataFrame'> Int64Index: 250152 entries, 0 to 250151 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 OP_CARRIER_AIRLINE_ID 250152 non-null object 1 TAIL_NUM 250152 non-null object 2 ORIGIN 250152 non-null object 3 ORIGIN_CITY_NAME 250152 non-null object 4 ORIGIN_STATE_NM 250152 non-null object 5 CRS_DEP_TIME 250152 non-null int64 6 TAXI_OUT 250152 non-null float64 7 CRS_ARR_TIME 250152 non-null int64 8 ARR_DEL15 250152 non-null float64 9 CRS_ELAPSED_TIME 250152 non-null float64 10 SEASON 250152 non-null int64 11 YEAR 250152 non-null int64 12 MONTH 250152 non-null int64 13 WEEK 250152 non-null int64 14 WEEKDAY 250152 non-null int64 15 WEEKEND 250152 non-null int32 dtypes: float64(3), int32(1), int64(7), object(5) memory usage: 31.5+ MB
Nuestro objetivo es predecir si un vuelo se retrasa más de 15 minutos. Como los modelos se basan en operaciones matemáticas, todas las variables que le pasemos deben estar en formato numérico, por lo que debemos transformar las variables categóricas en numéricas.
Las variables categóricas pueden ser nominales (no se puede establecer un orden entre sus categorías o etiquetas) u ordinales (sus categorías están ordenadas jerárquicamente).
jfk.describe(include='object').T
| count | unique | top | freq | |
|---|---|---|---|---|
| OP_CARRIER_AIRLINE_ID | 250,152.00 | 12.00 | 20,409.00 | 78,587.00 |
| TAIL_NUM | 250152 | 2646 | ZZZ | 15009 |
| ORIGIN | 250152 | 72 | LAX | 25345 |
| ORIGIN_CITY_NAME | 250152 | 70 | Los Angeles, CA | 25345 |
| ORIGIN_STATE_NM | 250152 | 32 | California | 51137 |
Para ello disponemos de diferentes técnicas para codificar las variables categóricas:
jfk['ORIGIN_STATE_NM'].value_counts().sort_values(ascending = True)
Wyoming 29 U.S. Virgin Islands 478 New Mexico 602 Hawaii 754 Kentucky 1395 Indiana 2086 Oregon 2528 Minnesota 2806 Michigan 2911 Tennessee 3004 Colorado 3072 Maryland 3179 South Carolina 3338 Pennsylvania 3537 Maine 3597 Vermont 3674 Louisiana 3707 Utah 4329 Arizona 5460 Illinois 5647 Ohio 6256 Washington 6491 Puerto Rico 7223 Nevada 7902 Texas 9322 Georgia 9534 Virginia 13250 Massachusetts 13273 North Carolina 14434 New York 14709 Florida 40488 California 51137 Name: ORIGIN_STATE_NM, dtype: int64
jfk['ORIGIN_STATE_NM'].value_counts().sort_values(ascending = True).index.to_list()
['Wyoming', 'U.S. Virgin Islands', 'New Mexico', 'Hawaii', 'Kentucky', 'Indiana', 'Oregon', 'Minnesota', 'Michigan', 'Tennessee', 'Colorado', 'Maryland', 'South Carolina', 'Pennsylvania', 'Maine', 'Vermont', 'Louisiana', 'Utah', 'Arizona', 'Illinois', 'Ohio', 'Washington', 'Puerto Rico', 'Nevada', 'Texas', 'Georgia', 'Virginia', 'Massachusetts', 'North Carolina', 'New York', 'Florida', 'California']
# Store the category labels, ordered from least to most frequent
categories_origin_state = jfk['ORIGIN_STATE_NM'].value_counts().sort_values(ascending = True).index.to_list()
# Build the encoder, passing it the explicit category order
encoder = OrdinalEncoder(categories = [categories_origin_state])
encoder
OrdinalEncoder(categories=[['Wyoming', 'U.S. Virgin Islands', 'New Mexico',
'Hawaii', 'Kentucky', 'Indiana', 'Oregon',
'Minnesota', 'Michigan', 'Tennessee', 'Colorado',
'Maryland', 'South Carolina', 'Pennsylvania',
'Maine', 'Vermont', 'Louisiana', 'Utah', 'Arizona',
'Illinois', 'Ohio', 'Washington', 'Puerto Rico',
'Nevada', 'Texas', 'Georgia', 'Virginia',
'Massachusetts', 'North Carolina', 'New York', ...]])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. OrdinalEncoder(categories=[['Wyoming', 'U.S. Virgin Islands', 'New Mexico',
'Hawaii', 'Kentucky', 'Indiana', 'Oregon',
'Minnesota', 'Michigan', 'Tennessee', 'Colorado',
'Maryland', 'South Carolina', 'Pennsylvania',
'Maine', 'Vermont', 'Louisiana', 'Utah', 'Arizona',
'Illinois', 'Ohio', 'Washington', 'Puerto Rico',
'Nevada', 'Texas', 'Georgia', 'Virginia',
'Massachusetts', 'North Carolina', 'New York', ...]])# Ajustamos el codificador con la variable que queremos codificar y la transformamos
# Fit the encoder on the column and transform it into ordinal codes
encoder.fit(jfk[['ORIGIN_STATE_NM']])
jfk['ORIGIN_STATE_NM_ENCODED'] = encoder.transform(jfk[['ORIGIN_STATE_NM']])
# Values before and after the encoding
jfk[['ORIGIN_STATE_NM', 'ORIGIN_STATE_NM_ENCODED']].head()
| ORIGIN_STATE_NM | ORIGIN_STATE_NM_ENCODED | |
|---|---|---|
| 0 | Tennessee | 9.00 |
| 1 | North Carolina | 28.00 |
| 2 | New York | 29.00 |
| 3 | Virginia | 26.00 |
| 4 | Massachusetts | 27.00 |
# Drop the generated column since we will not use this encoding technique
del jfk['ORIGIN_STATE_NM_ENCODED']
pd.get_dummies?
Signature: pd.get_dummies( data, prefix=None, prefix_sep='_', dummy_na: 'bool' = False, columns=None, sparse: 'bool' = False, drop_first: 'bool' = False, dtype: 'Dtype | None' = None, ) -> 'DataFrame' Docstring: Convert categorical variable into dummy/indicator variables. Parameters ---------- data : array-like, Series, or DataFrame Data of which to get dummy indicators. prefix : str, list of str, or dict of str, default None String to append DataFrame column names. Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternatively, `prefix` can be a dictionary mapping column names to prefixes. prefix_sep : str, default '_' If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix`. dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with `object` or `category` dtype will be converted. sparse : bool, default False Whether the dummy-encoded columns should be backed by a :class:`SparseArray` (True) or a regular NumPy array (False). drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. dtype : dtype, default np.uint8 Data type for new columns. Only a single dtype is allowed. Returns ------- DataFrame Dummy-coded data. See Also -------- Series.str.get_dummies : Convert Series to dummy codes. Notes ----- Reference :ref:`the user guide <reshaping.dummies>` for more examples. Examples -------- >>> s = pd.Series(list('abca')) >>> pd.get_dummies(s) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 >>> s1 = ['a', 'b', np.nan] >>> pd.get_dummies(s1) a b 0 1 0 1 0 1 2 0 0 >>> pd.get_dummies(s1, dummy_na=True) a b NaN 0 1 0 0 1 0 1 0 2 0 0 1 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], ... 
'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c 0 1 1 0 0 1 0 1 2 0 1 1 0 0 2 3 1 0 0 0 1 >>> pd.get_dummies(pd.Series(list('abcaa'))) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 4 1 0 0 >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) b c 0 0 0 1 1 0 2 0 1 3 0 0 4 0 0 >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) a b c 0 1.0 0.0 0.0 1 0.0 1.0 0.0 2 0.0 0.0 1.0 File: c:\users\jagui\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py Type: function
# Build the one-hot (dummy) columns with pandas.
# prefix_sep='_' and drop_first=False are the defaults, so they are omitted.
dummies_dataframe_ORIGIN_STATE_NM = pd.get_dummies(
    jfk['ORIGIN_STATE_NM'],       # column to encode
    prefix='ORIGIN_STATE_NM',     # prefix for the new column names
)
dummies_dataframe_ORIGIN_STATE_NM.head(10).T
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| ORIGIN_STATE_NM_Arizona | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_California | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Colorado | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Florida | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Georgia | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Hawaii | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Illinois | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Indiana | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Kentucky | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Louisiana | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Maine | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Maryland | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
| ORIGIN_STATE_NM_Massachusetts | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Michigan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Minnesota | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Nevada | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_New Mexico | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_New York | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_North Carolina | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Ohio | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Oregon | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Pennsylvania | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Puerto Rico | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_South Carolina | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Tennessee | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Texas | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_U.S. Virgin Islands | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Utah | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Vermont | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Virginia | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| ORIGIN_STATE_NM_Washington | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ORIGIN_STATE_NM_Wyoming | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
El siguiente paso es unir el dummies a nuestro dataframe y finalmente eliminar la variable de origen ORIGIN_STATE_NM:
jfk['ORIGIN_STATE_NM'] = pd.concat([jfk, dummies_dataframe_ORIGIN_STATE_NM])
# Instanciamos la clase
le = LabelEncoder()
le
LabelEncoder()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LabelEncoder()
# Train the encoder with the fit method and transform the column
le.fit(jfk['TAIL_NUM'])
jfk['TAIL_NUM_le'] = le.transform(jfk['TAIL_NUM'])
jfk['TAIL_NUM_le'].head()
0 467 1 2224 2 1258 3 1651 4 1248 Name: TAIL_NUM_le, dtype: int32
# Same result in a single step with fit_transform
jfk['TAIL_NUM_le'] = le.fit_transform(jfk['TAIL_NUM'])
jfk['TAIL_NUM_le'].head()
0 467 1 2224 2 1258 3 1651 4 1248 Name: TAIL_NUM_le, dtype: int32
# Check that every aircraft is represented
jfk['TAIL_NUM_le'].nunique()
2646
Una vez entrenado el codificador, podemos acceder a sus clases
le.classes_
array(['8805', '8809', 'N101DU', ..., 'N999JB', 'N999JQ', 'ZZZ'],
dtype=object)
El valor numérico por el que se sustituye a cada una de estas categorías es el índice que ocupa en el array anterior. El avión 8805 tiene un índice 0, por lo que debería ser sustituido por un 0. Lo comprobamos:
jfk[jfk['TAIL_NUM_le'] == 0]['TAIL_NUM']
214806 8805 215526 8805 215527 8805 Name: TAIL_NUM, dtype: object
Eliminamos la variable creada, ya que no utilizaremos esta técnica
del jfk['TAIL_NUM_le']
En aquellas variable categóricas que no tengan un orden y no tengan muchas etiquetas, podemos realizar un One Hot Encoding.
Veamos qué tecnicas de codificación emplearemos
jfk.select_dtypes(include=['object']).describe().T
| count | unique | top | freq | |
|---|---|---|---|---|
| OP_CARRIER_AIRLINE_ID | 250,152.00 | 12.00 | 20,409.00 | 78,587.00 |
| TAIL_NUM | 250152 | 2646 | ZZZ | 15009 |
| ORIGIN | 250152 | 72 | LAX | 25345 |
| ORIGIN_CITY_NAME | 250152 | 70 | Los Angeles, CA | 25345 |
| ORIGIN_STATE_NM | 250152 | 32 | California | 51137 |
Todas nuestras variables categóricas son candidatas a ser codificadas mediante un One Hot:
Entre cuatro variables se generarán 186 variables nuevas, que no supone ningún problema para el Decision Tress y tampoco nos va a dar problemas de memoria en nuestro equipo.
En cambio la variable TAIL_NUMBER va a generar demasiadas variables nuevas. Debemos tratarla de otra manera.
Como regla general usaremos la siguiente:
TAIL_NUMBER
Analizamos la correlación que existe entre la variable y el target. Para ello utilizamos un pivot_table y ordenamos los registros en función del número de vuelos que realiza cada avión.
jfk.pivot_table(index = 'TAIL_NUM', values = target, aggfunc = [len, sum, np.mean]).sort_values(by = [('len', target)], ascending = False)
| len | sum | mean | |
|---|---|---|---|
| ARR_DEL15 | ARR_DEL15 | ARR_DEL15 | |
| TAIL_NUM | |||
| ZZZ | 15009 | 3,109.00 | 0.21 |
| N110AN | 761 | 164.00 | 0.22 |
| N113AN | 751 | 134.00 | 0.18 |
| N111ZM | 748 | 162.00 | 0.22 |
| N109NN | 740 | 118.00 | 0.16 |
| ... | ... | ... | ... |
| N745VJ | 1 | 0.00 | 0.00 |
| N744P | 1 | 0.00 | 0.00 |
| N734AR | 1 | 0.00 | 0.00 |
| N733AR | 1 | 0.00 | 0.00 |
| N719AN | 1 | 0.00 | 0.00 |
2646 rows × 3 columns
((jfk['TAIL_NUM'] == 'N110AN').value_counts()[1])/(jfk.shape[0])*100
0.30421503725734755
El avión con mayor número de registros es el N110AN con 761, que tan sólo representa un 0,3% de todos los vuelos. Hacer una codificación One Hot de esta variable va a aportar muy poca información al modelo. Aplicaremos la técnica del Frequency Encoding.
Vamos a hacer un conteo de el número de vuelos que realiza cada avión y sustituiremos el valor del número de vuelos por cada avión correspondiente. Con esto conseguimos sustituir la variable categórica por una de numérica. Además, mantenemos cierta información del avión, el número de vuelos puede tener relación con la antigüedad y uso del avión. En consecuencia, aviones que se utilizan más podrían ser más propensos a sufrir averias. Esto son solo hipótesis, dejaremos después que el algoritmo decida si es una buena variable.
# Frequency Encoding of TAIL_NUM: replace each tail number with the number
# of flights that aircraft operates.
tail_counts = jfk['TAIL_NUM'].value_counts(dropna = False)  # flights per aircraft
# map() looks each value up in the count Series and, unlike the merge()
# used previously, preserves the original row order of the dataframe.
jfk['TAIL_NUM_COUNT'] = jfk['TAIL_NUM'].map(tail_counts)
jfk.drop('TAIL_NUM', axis = 1, inplace = True)  # drop the original column
jfk.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| OP_CARRIER_AIRLINE_ID | 20,363.00 | 20,363.00 | 20,363.00 | 20,363.00 | 20,363.00 |
| ORIGIN | BNA | PIT | BOS | PHL | CHS |
| ORIGIN_CITY_NAME | Nashville, TN | Pittsburgh, PA | Boston, MA | Philadelphia, PA | Charleston, SC |
| ORIGIN_STATE_NM | Tennessee | Pennsylvania | Massachusetts | Pennsylvania | South Carolina |
| CRS_DEP_TIME | 1359 | 1021 | 1325 | 600 | 1715 |
| TAXI_OUT | 27.00 | 11.00 | 31.00 | 19.00 | 11.00 |
| CRS_ARR_TIME | 1719 | 1158 | 1454 | 711 | 1930 |
| ARR_DEL15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| CRS_ELAPSED_TIME | 140.00 | 97.00 | 89.00 | 71.00 | 135.00 |
| SEASON | 1 | 1 | 1 | 1 | 1 |
| YEAR | 2018 | 2018 | 2018 | 2018 | 2018 |
| MONTH | 2 | 2 | 2 | 2 | 2 |
| WEEK | 5 | 5 | 6 | 6 | 6 |
| WEEKDAY | 3 | 5 | 1 | 3 | 6 |
| WEEKEND | 0 | 1 | 0 | 0 | 1 |
| TAIL_NUM_COUNT | 340 | 340 | 340 | 340 | 340 |
Sobre el resto de variables categóricas, aplicaremos un OHE que nos generará un total de 186 nuevas columnas.
jfk.select_dtypes(include=['object']).describe().T
| count | unique | top | freq | |
|---|---|---|---|---|
| OP_CARRIER_AIRLINE_ID | 250,152.00 | 12.00 | 20,409.00 | 78,587.00 |
| ORIGIN | 250152 | 72 | LAX | 25345 |
| ORIGIN_CITY_NAME | 250152 | 70 | Los Angeles, CA | 25345 |
| ORIGIN_STATE_NM | 250152 | 32 | California | 51137 |
# Categorical columns to one-hot encode in a single pass
list_OHE = ['OP_CARRIER_AIRLINE_ID','ORIGIN', 'ORIGIN_CITY_NAME', 'ORIGIN_STATE_NM']
# generates the new dummy columns and drops the original ones at the same time
jfk = pd.get_dummies(data = jfk, prefix=None, prefix_sep="_", columns = list_OHE, drop_first = False)
O bien, podríamos aplicar el OHE a cada una de las variables por separado, aplicando la siguiente función:
def OHE(dataframe, column_name):
    """Append one-hot dummy columns for *column_name* and return the frame.

    The column name is used as prefix so the generated labels stay unique.
    The source column is intentionally kept (its removal was left commented
    out in the original version).
    """
    dummy_block = pd.get_dummies(dataframe[column_name], prefix=column_name)
    return pd.concat([dataframe, dummy_block], axis=1)
# Comprobamos que no queda ninguna variable categórica
jfk.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Int64Index: 250152 entries, 0 to 250151 Columns: 198 entries, CRS_DEP_TIME to ORIGIN_STATE_NM_Wyoming dtypes: float64(3), int32(1), int64(8), uint8(186) memory usage: 68.2 MB
Estandarizar una variable es transformarla restando la media y dividiendo por la desviación típica: la variable resultante tiene media 0 y desviación típica 1, de modo que un valor de 1 o -1 corresponde a estar a una desviación típica por encima o por debajo de la media (los valores no quedan limitados al rango [-1, 1], ni coinciden necesariamente con los máximos y mínimos). Existen diferentes formas de hacerlo:
from scipy import stats
# NOTE(review): axis=1 computes the z-score across each ROW (mixing all
# features of a single flight); standardizing each column/feature — which is
# what the surrounding text describes — would be axis=0. Confirm intent.
zscore_df_1 = stats.zscore(jfk, axis=1)
zscore_df_1.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| CRS_DEP_TIME | 6.29 | 5.52 | 6.49 | 3.66 | 7.25 |
| TAXI_OUT | -0.01 | -0.07 | 0.02 | -0.00 | -0.09 |
| CRS_ARR_TIME | 8.00 | 6.28 | 7.13 | 4.36 | 8.18 |
| ARR_DEL15 | -0.13 | -0.13 | -0.13 | -0.12 | -0.13 |
| CRS_ELAPSED_TIME | 0.53 | 0.41 | 0.31 | 0.33 | 0.45 |
| ... | ... | ... | ... | ... | ... |
| ORIGIN_STATE_NM_Utah | -0.13 | -0.13 | -0.13 | -0.12 | -0.13 |
| ORIGIN_STATE_NM_Vermont | -0.13 | -0.13 | -0.13 | -0.12 | -0.13 |
| ORIGIN_STATE_NM_Virginia | -0.13 | -0.13 | -0.13 | -0.12 | -0.13 |
| ORIGIN_STATE_NM_Washington | -0.13 | -0.13 | -0.13 | -0.12 | -0.13 |
| ORIGIN_STATE_NM_Wyoming | -0.13 | -0.13 | -0.13 | -0.12 | -0.13 |
198 rows × 5 columns
# StandardScaler: per-column z-score standardization
z_scaler = StandardScaler()
jfk_z = z_scaler.fit_transform(jfk) # returns a NumPy array
zscore_df_2 = pd.DataFrame(jfk_z)
zscore_df_2.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| 0 | 0.05 | -0.59 | -0.02 | -1.39 | 0.72 |
| 1 | 0.31 | 0.25 | 0.32 | 0.28 | 0.25 |
| 2 | 0.52 | -0.45 | 0.06 | -1.22 | 0.88 |
| 3 | -0.52 | -0.52 | -0.52 | -0.52 | 1.94 |
| 4 | -0.57 | -1.01 | -1.10 | -1.28 | -0.62 |
| ... | ... | ... | ... | ... | ... |
| 193 | -0.13 | -0.13 | -0.13 | -0.13 | -0.13 |
| 194 | -0.12 | -0.12 | -0.12 | -0.12 | -0.12 |
| 195 | -0.24 | -0.24 | -0.24 | -0.24 | -0.24 |
| 196 | -0.16 | -0.16 | -0.16 | -0.16 | -0.16 |
| 197 | -0.01 | -0.01 | -0.01 | -0.01 | -0.01 |
198 rows × 5 columns
# MinMaxScaler: rescale each column to the [0, 1] range
minMaxResultado=MinMaxScaler()
jfk_mm = minMaxResultado.fit_transform(jfk)
jfk_mm_df = pd.DataFrame(jfk_mm)
jfk_mm_df.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| 0 | 0.58 | 0.43 | 0.56 | 0.25 | 0.73 |
| 1 | 0.88 | 0.87 | 0.88 | 0.87 | 0.87 |
| 2 | 0.72 | 0.48 | 0.61 | 0.30 | 0.80 |
| 3 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| 4 | 0.17 | 0.09 | 0.08 | 0.04 | 0.16 |
| ... | ... | ... | ... | ... | ... |
| 193 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 194 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 195 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 196 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 197 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
198 rows × 5 columns
# Function that applies MinMax scaling to a single column
def minmaxTransformacion(dataframe, variable_transformar):
    """Rescale one column to [0, 1] in place and return the fitted scaler."""
    scaler = MinMaxScaler()
    as_matrix = dataframe[variable_transformar].values.reshape(-1, 1)
    dataframe[variable_transformar] = scaler.fit_transform(as_matrix)
    return scaler
Como vamos a trabajar con un modelo Decision Trees no es necesario hacer ningún tratamiento de estandarización.
# Transposed preview of the final (un-scaled) feature matrix.
jfk.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| CRS_DEP_TIME | 1,359.00 | 1,021.00 | 1,325.00 | 600.00 | 1,715.00 |
| TAXI_OUT | 27.00 | 11.00 | 31.00 | 19.00 | 11.00 |
| CRS_ARR_TIME | 1,719.00 | 1,158.00 | 1,454.00 | 711.00 | 1,930.00 |
| ARR_DEL15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| CRS_ELAPSED_TIME | 140.00 | 97.00 | 89.00 | 71.00 | 135.00 |
| ... | ... | ... | ... | ... | ... |
| ORIGIN_STATE_NM_Utah | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ORIGIN_STATE_NM_Vermont | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ORIGIN_STATE_NM_Virginia | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ORIGIN_STATE_NM_Washington | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ORIGIN_STATE_NM_Wyoming | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
198 rows × 5 columns
# Full column/dtype listing of the final dataset (198 columns).
jfk.info(verbose = True)
<class 'pandas.core.frame.DataFrame'> Int64Index: 250152 entries, 0 to 250151 Data columns (total 198 columns): # Column Dtype --- ------ ----- 0 CRS_DEP_TIME int64 1 TAXI_OUT float64 2 CRS_ARR_TIME int64 3 ARR_DEL15 float64 4 CRS_ELAPSED_TIME float64 5 SEASON int64 6 YEAR int64 7 MONTH int64 8 WEEK int64 9 WEEKDAY int64 10 WEEKEND int32 11 TAIL_NUM_COUNT int64 12 OP_CARRIER_AIRLINE_ID_19690.0 uint8 13 OP_CARRIER_AIRLINE_ID_19790.0 uint8 14 OP_CARRIER_AIRLINE_ID_19805.0 uint8 15 OP_CARRIER_AIRLINE_ID_19930.0 uint8 16 OP_CARRIER_AIRLINE_ID_20304.0 uint8 17 OP_CARRIER_AIRLINE_ID_20363.0 uint8 18 OP_CARRIER_AIRLINE_ID_20397.0 uint8 19 OP_CARRIER_AIRLINE_ID_20398.0 uint8 20 OP_CARRIER_AIRLINE_ID_20409.0 uint8 21 OP_CARRIER_AIRLINE_ID_20452.0 uint8 22 OP_CARRIER_AIRLINE_ID_21171.0 uint8 23 OP_CARRIER_AIRLINE_ID_ZZZ uint8 24 ORIGIN_ABQ uint8 25 ORIGIN_ACK uint8 26 ORIGIN_ATL uint8 27 ORIGIN_AUS uint8 28 ORIGIN_BGR uint8 29 ORIGIN_BNA uint8 30 ORIGIN_BOS uint8 31 ORIGIN_BQN uint8 32 ORIGIN_BTV uint8 33 ORIGIN_BUF uint8 34 ORIGIN_BUR uint8 35 ORIGIN_BWI uint8 36 ORIGIN_CHS uint8 37 ORIGIN_CLE uint8 38 ORIGIN_CLT uint8 39 ORIGIN_CMH uint8 40 ORIGIN_CVG uint8 41 ORIGIN_DAB uint8 42 ORIGIN_DCA uint8 43 ORIGIN_DEN uint8 44 ORIGIN_DFW uint8 45 ORIGIN_DTW uint8 46 ORIGIN_EGE uint8 47 ORIGIN_FLL uint8 48 ORIGIN_HNL uint8 49 ORIGIN_HOU uint8 50 ORIGIN_HYA uint8 51 ORIGIN_IAD uint8 52 ORIGIN_IAH uint8 53 ORIGIN_IND uint8 54 ORIGIN_JAC uint8 55 ORIGIN_JAX uint8 56 ORIGIN_LAS uint8 57 ORIGIN_LAX uint8 58 ORIGIN_LGB uint8 59 ORIGIN_MCO uint8 60 ORIGIN_MIA uint8 61 ORIGIN_MSP uint8 62 ORIGIN_MSY uint8 63 ORIGIN_MVY uint8 64 ORIGIN_OAK uint8 65 ORIGIN_ONT uint8 66 ORIGIN_ORD uint8 67 ORIGIN_ORF uint8 68 ORIGIN_ORH uint8 69 ORIGIN_PBI uint8 70 ORIGIN_PDX uint8 71 ORIGIN_PHL uint8 72 ORIGIN_PHX uint8 73 ORIGIN_PIT uint8 74 ORIGIN_PSE uint8 75 ORIGIN_PSP uint8 76 ORIGIN_PWM uint8 77 ORIGIN_RDU uint8 78 ORIGIN_RIC uint8 79 ORIGIN_RNO uint8 80 ORIGIN_ROC uint8 81 ORIGIN_RSW uint8 82 
ORIGIN_SAN uint8 83 ORIGIN_SAT uint8 84 ORIGIN_SAV uint8 85 ORIGIN_SEA uint8 86 ORIGIN_SFO uint8 87 ORIGIN_SJC uint8 88 ORIGIN_SJU uint8 89 ORIGIN_SLC uint8 90 ORIGIN_SMF uint8 91 ORIGIN_SNA uint8 92 ORIGIN_SRQ uint8 93 ORIGIN_STT uint8 94 ORIGIN_SYR uint8 95 ORIGIN_TPA uint8 96 ORIGIN_CITY_NAME_Aguadilla, PR uint8 97 ORIGIN_CITY_NAME_Albuquerque, NM uint8 98 ORIGIN_CITY_NAME_Atlanta, GA uint8 99 ORIGIN_CITY_NAME_Austin, TX uint8 100 ORIGIN_CITY_NAME_Baltimore, MD uint8 101 ORIGIN_CITY_NAME_Bangor, ME uint8 102 ORIGIN_CITY_NAME_Boston, MA uint8 103 ORIGIN_CITY_NAME_Buffalo, NY uint8 104 ORIGIN_CITY_NAME_Burbank, CA uint8 105 ORIGIN_CITY_NAME_Burlington, VT uint8 106 ORIGIN_CITY_NAME_Charleston, SC uint8 107 ORIGIN_CITY_NAME_Charlotte Amalie, VI uint8 108 ORIGIN_CITY_NAME_Charlotte, NC uint8 109 ORIGIN_CITY_NAME_Chicago, IL uint8 110 ORIGIN_CITY_NAME_Cincinnati, OH uint8 111 ORIGIN_CITY_NAME_Cleveland, OH uint8 112 ORIGIN_CITY_NAME_Columbus, OH uint8 113 ORIGIN_CITY_NAME_Dallas/Fort Worth, TX uint8 114 ORIGIN_CITY_NAME_Daytona Beach, FL uint8 115 ORIGIN_CITY_NAME_Denver, CO uint8 116 ORIGIN_CITY_NAME_Detroit, MI uint8 117 ORIGIN_CITY_NAME_Eagle, CO uint8 118 ORIGIN_CITY_NAME_Fort Lauderdale, FL uint8 119 ORIGIN_CITY_NAME_Fort Myers, FL uint8 120 ORIGIN_CITY_NAME_Honolulu, HI uint8 121 ORIGIN_CITY_NAME_Houston, TX uint8 122 ORIGIN_CITY_NAME_Hyannis, MA uint8 123 ORIGIN_CITY_NAME_Indianapolis, IN uint8 124 ORIGIN_CITY_NAME_Jackson, WY uint8 125 ORIGIN_CITY_NAME_Jacksonville, FL uint8 126 ORIGIN_CITY_NAME_Las Vegas, NV uint8 127 ORIGIN_CITY_NAME_Long Beach, CA uint8 128 ORIGIN_CITY_NAME_Los Angeles, CA uint8 129 ORIGIN_CITY_NAME_Martha's Vineyard, MA uint8 130 ORIGIN_CITY_NAME_Miami, FL uint8 131 ORIGIN_CITY_NAME_Minneapolis, MN uint8 132 ORIGIN_CITY_NAME_Nantucket, MA uint8 133 ORIGIN_CITY_NAME_Nashville, TN uint8 134 ORIGIN_CITY_NAME_New Orleans, LA uint8 135 ORIGIN_CITY_NAME_Norfolk, VA uint8 136 ORIGIN_CITY_NAME_Oakland, CA uint8 137 ORIGIN_CITY_NAME_Ontario, CA 
uint8 138 ORIGIN_CITY_NAME_Orlando, FL uint8 139 ORIGIN_CITY_NAME_Palm Springs, CA uint8 140 ORIGIN_CITY_NAME_Philadelphia, PA uint8 141 ORIGIN_CITY_NAME_Phoenix, AZ uint8 142 ORIGIN_CITY_NAME_Pittsburgh, PA uint8 143 ORIGIN_CITY_NAME_Ponce, PR uint8 144 ORIGIN_CITY_NAME_Portland, ME uint8 145 ORIGIN_CITY_NAME_Portland, OR uint8 146 ORIGIN_CITY_NAME_Raleigh/Durham, NC uint8 147 ORIGIN_CITY_NAME_Reno, NV uint8 148 ORIGIN_CITY_NAME_Richmond, VA uint8 149 ORIGIN_CITY_NAME_Rochester, NY uint8 150 ORIGIN_CITY_NAME_Sacramento, CA uint8 151 ORIGIN_CITY_NAME_Salt Lake City, UT uint8 152 ORIGIN_CITY_NAME_San Antonio, TX uint8 153 ORIGIN_CITY_NAME_San Diego, CA uint8 154 ORIGIN_CITY_NAME_San Francisco, CA uint8 155 ORIGIN_CITY_NAME_San Jose, CA uint8 156 ORIGIN_CITY_NAME_San Juan, PR uint8 157 ORIGIN_CITY_NAME_Santa Ana, CA uint8 158 ORIGIN_CITY_NAME_Sarasota/Bradenton, FL uint8 159 ORIGIN_CITY_NAME_Savannah, GA uint8 160 ORIGIN_CITY_NAME_Seattle, WA uint8 161 ORIGIN_CITY_NAME_Syracuse, NY uint8 162 ORIGIN_CITY_NAME_Tampa, FL uint8 163 ORIGIN_CITY_NAME_Washington, DC uint8 164 ORIGIN_CITY_NAME_West Palm Beach/Palm Beach, FL uint8 165 ORIGIN_CITY_NAME_Worcester, MA uint8 166 ORIGIN_STATE_NM_Arizona uint8 167 ORIGIN_STATE_NM_California uint8 168 ORIGIN_STATE_NM_Colorado uint8 169 ORIGIN_STATE_NM_Florida uint8 170 ORIGIN_STATE_NM_Georgia uint8 171 ORIGIN_STATE_NM_Hawaii uint8 172 ORIGIN_STATE_NM_Illinois uint8 173 ORIGIN_STATE_NM_Indiana uint8 174 ORIGIN_STATE_NM_Kentucky uint8 175 ORIGIN_STATE_NM_Louisiana uint8 176 ORIGIN_STATE_NM_Maine uint8 177 ORIGIN_STATE_NM_Maryland uint8 178 ORIGIN_STATE_NM_Massachusetts uint8 179 ORIGIN_STATE_NM_Michigan uint8 180 ORIGIN_STATE_NM_Minnesota uint8 181 ORIGIN_STATE_NM_Nevada uint8 182 ORIGIN_STATE_NM_New Mexico uint8 183 ORIGIN_STATE_NM_New York uint8 184 ORIGIN_STATE_NM_North Carolina uint8 185 ORIGIN_STATE_NM_Ohio uint8 186 ORIGIN_STATE_NM_Oregon uint8 187 ORIGIN_STATE_NM_Pennsylvania uint8 188 ORIGIN_STATE_NM_Puerto Rico uint8 189 
ORIGIN_STATE_NM_South Carolina uint8 190 ORIGIN_STATE_NM_Tennessee uint8 191 ORIGIN_STATE_NM_Texas uint8 192 ORIGIN_STATE_NM_U.S. Virgin Islands uint8 193 ORIGIN_STATE_NM_Utah uint8 194 ORIGIN_STATE_NM_Vermont uint8 195 ORIGIN_STATE_NM_Virginia uint8 196 ORIGIN_STATE_NM_Washington uint8 197 ORIGIN_STATE_NM_Wyoming uint8 dtypes: float64(3), int32(1), int64(8), uint8(186) memory usage: 68.2 MB
# Current working directory (the relative output paths below depend on it).
print(os.getcwd())
D:\Data_Science\MACHINE_LEARNING\ML_SUPERVISADO\ML_SUPERVISADO_CLASIFICACION\Machine-Learning-Binary-Classifier-JFK-DELAY
# Persist the final dataset in two formats:
#  - pickle keeps dtypes and index intact (this is what gets reloaded below)
#  - CSV is a portable export; index=False avoids writing the row index as
#    a spurious unnamed first column in the file
jfk.to_pickle('pickle/jfk_final.pkl')
jfk.to_csv('final/jfk_final.csv', index=False)
# Reload the final dataset (not a model) from the pickle file; pickle
# round-trips dtypes and index exactly as saved.
df = pd.read_pickle('pickle/jfk_final.pkl')
df.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| CRS_DEP_TIME | 1,359.00 | 1,021.00 | 1,325.00 | 600.00 | 1,715.00 |
| TAXI_OUT | 27.00 | 11.00 | 31.00 | 19.00 | 11.00 |
| CRS_ARR_TIME | 1,719.00 | 1,158.00 | 1,454.00 | 711.00 | 1,930.00 |
| ARR_DEL15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| CRS_ELAPSED_TIME | 140.00 | 97.00 | 89.00 | 71.00 | 135.00 |
| ... | ... | ... | ... | ... | ... |
| ORIGIN_STATE_NM_Utah | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ORIGIN_STATE_NM_Vermont | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ORIGIN_STATE_NM_Virginia | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ORIGIN_STATE_NM_Washington | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ORIGIN_STATE_NM_Wyoming | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
198 rows × 5 columns
En este punto iniciamos con la fase de modelización, que ocupa los pasos de generación de la estrategia de validación, el entrenamiento del modelo y el desarrollo de predicciones y la validación de éstas.
Algunas de las estrategias de validación:
Podemos analizar los métodos disponibles con "??"
# IPython introspection: dump the source of sklearn.model_selection to see
# the available validation strategies (train_test_split, KFold, ...).
model_selection??
Type: module String form: <module 'sklearn.model_selection' from 'c:\\Users\\jagui\\anaconda3\\lib\\site-packages\\sklearn\\model_selection\\__init__.py'> File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\model_selection\__init__.py Source: import typing from ._plot import LearningCurveDisplay, ValidationCurveDisplay from ._search import GridSearchCV, ParameterGrid, ParameterSampler, RandomizedSearchCV from ._split import ( BaseCrossValidator, BaseShuffleSplit, GroupKFold, GroupShuffleSplit, KFold, LeaveOneGroupOut, LeaveOneOut, LeavePGroupsOut, LeavePOut, PredefinedSplit, RepeatedKFold, RepeatedStratifiedKFold, ShuffleSplit, StratifiedGroupKFold, StratifiedKFold, StratifiedShuffleSplit, TimeSeriesSplit, check_cv, train_test_split, ) from ._validation import ( cross_val_predict, cross_val_score, cross_validate, learning_curve, permutation_test_score, validation_curve, ) if typing.TYPE_CHECKING: # Avoid errors in type checkers (e.g. mypy) for experimental estimators. # TODO: remove this check once the estimator is no longer experimental. from ._search_successive_halving import ( # noqa HalvingGridSearchCV, HalvingRandomSearchCV, ) __all__ = [ "BaseCrossValidator", "BaseShuffleSplit", "GridSearchCV", "TimeSeriesSplit", "KFold", "GroupKFold", "GroupShuffleSplit", "LeaveOneGroupOut", "LeaveOneOut", "LeavePGroupsOut", "LeavePOut", "RepeatedKFold", "RepeatedStratifiedKFold", "ParameterGrid", "ParameterSampler", "PredefinedSplit", "RandomizedSearchCV", "ShuffleSplit", "StratifiedKFold", "StratifiedGroupKFold", "StratifiedShuffleSplit", "check_cv", "cross_val_predict", "cross_val_score", "cross_validate", "learning_curve", "LearningCurveDisplay", "permutation_test_score", "train_test_split", "validation_curve", "ValidationCurveDisplay", ] # TODO: remove this check once the estimator is no longer experimental. 
def __getattr__(name): if name in {"HalvingGridSearchCV", "HalvingRandomSearchCV"}: raise ImportError( f"{name} is experimental and the API might change without any " "deprecation cycle. To use it, you need to explicitly import " "enable_halving_search_cv:\n" "from sklearn.experimental import enable_halving_search_cv" ) raise AttributeError(f"module {__name__} has no attribute {name}")
Y vemos que disponemos de métodos para el random holdout (train_test_split) y k-fold (KFold).
A.- Development/Validation
Nuestro dataset está particionado temporalmente, de forma que tenemos un registro único para cada fecha, hora y avión.
En este tipo de datasets, es conveniente guardar las particiones más recientes para hacer la validación del modelo, contrastando que nuestro modelo se pueda generalizar en el tiempo futuro (es de esperar que los retrasos tengan un patrón de comportamiento similar si tomamos como medida el "año").
Para decidir qué partición utilizamos para la validación, vemos el número de registros que hay cada año (hay que recordar que la partición de validación no es aleatoria, debe ser lo más parecido a la realidad)
# Records per year — both years are roughly balanced (~125k each).
df['YEAR'].value_counts()
2018 125483 2019 124669 Name: YEAR, dtype: int64
Analizamos también el comportamiento en cada partición año/mes
df.pivot_table(
    index = ['YEAR','MONTH'], # group by year AND month so months of different years are not mixed
    values = target,
    # len = total flights, sum = delayed flights, np.mean = delayed ratio.
    # NOTE(review): newer pandas prefers the string 'mean' over np.mean here;
    # also len counts NaN while 'count' would not — confirm before changing.
    aggfunc = [len, sum, np.mean]
)
| len | sum | mean | ||
|---|---|---|---|---|
| ARR_DEL15 | ARR_DEL15 | ARR_DEL15 | ||
| YEAR | MONTH | |||
| 2018 | 1 | 9244 | 1,968.00 | 0.21 |
| 2 | 9115 | 1,418.00 | 0.16 | |
| 3 | 9927 | 1,825.00 | 0.18 | |
| 4 | 10393 | 2,184.00 | 0.21 | |
| 5 | 10915 | 2,478.00 | 0.23 | |
| 6 | 11052 | 2,504.00 | 0.23 | |
| 7 | 11201 | 3,463.00 | 0.31 | |
| 8 | 11202 | 3,345.00 | 0.30 | |
| 9 | 10591 | 2,304.00 | 0.22 | |
| 10 | 10952 | 1,628.00 | 0.15 | |
| 11 | 10194 | 2,138.00 | 0.21 | |
| 12 | 10697 | 2,199.00 | 0.21 | |
| 2019 | 1 | 10245 | 2,154.00 | 0.21 |
| 2 | 9381 | 1,817.00 | 0.19 | |
| 3 | 11084 | 1,812.00 | 0.16 | |
| 4 | 10308 | 2,236.00 | 0.22 | |
| 5 | 10797 | 2,122.00 | 0.20 | |
| 6 | 10543 | 2,743.00 | 0.26 | |
| 7 | 10661 | 2,735.00 | 0.26 | |
| 8 | 10782 | 2,912.00 | 0.27 | |
| 9 | 10120 | 1,544.00 | 0.15 | |
| 10 | 10426 | 1,418.00 | 0.14 | |
| 11 | 9937 | 1,205.00 | 0.12 | |
| 12 | 10385 | 2,480.00 | 0.24 |
Vamos a imaginarnos que queremos hacer una predicción para los primeros meses del 2020. ¿Qué partición podemos hacer?:
Elegimos los registros de los últimos 6 meses del 2019 para realizar la validación (validation split) y el resto para realizar el entrenamiento del modelo (train/test --> development split).
# Temporal split: development (train/test) on all of 2018 plus Jan-Jun 2019,
# validation on the most recent six months (Jul-Dec 2019).
# The year is stated explicitly in both halves of the development condition:
# the shorter `(YEAR == 2018) | (MONTH < 7)` only works because the dataset
# contains exactly 2018 and 2019, and would silently leak rows from any
# other year. Result is identical on this dataset.
dev_df = jfk[(jfk['YEAR'] == 2018) | ((jfk['YEAR'] == 2019) & (jfk['MONTH'] < 7))]  # development split = train + test
val_df = jfk[(jfk['YEAR'] == 2019) & (jfk['MONTH'] >= 7)]                           # validation split
# Size of the development (train/test) partition.
dev_df.shape
(187841, 198)
# Size of the validation partition.
val_df.shape
(62311, 198)
La partición de validación debe tener como mínimo 1000 o 2000 filas, por lo que vamos sobrados (tenemos 62311 filas)
# Sanity check: the two partitions cover every record exactly once.
len(dev_df)+len(val_df)==len(df)
True
Me aseguro de que la distribución de train y test sea la misma, porque si funciona bien en train y no funciona bien en test, sabré que la causa es que está memorizando (overfitting).
Para hacer esto tengo que separar atributos y target (sin meter el inplace=True porque el original no quiero modificarlo) --> esto lo hago tanto para validación como para development.
# Development (train/test) split: separate features (X) from target (y).
dev_df_X = dev_df.drop(target, axis=1) # no inplace=True: keep the original DataFrame untouched
dev_df_y = dev_df[[target]] # double brackets so the result is a DataFrame, not a Series
# Validation split: same separation.
val_df_X = val_df.drop(target, axis=1) # no inplace=True: keep the original DataFrame untouched
val_df_y = val_df[[target]] # double brackets so the result is a DataFrame, not a Series
# Preview of the development feature matrix (target column removed).
dev_df_X.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| CRS_DEP_TIME | 1,359.00 | 1,021.00 | 1,325.00 | 600.00 | 1,715.00 |
| TAXI_OUT | 27.00 | 11.00 | 31.00 | 19.00 | 11.00 |
| CRS_ARR_TIME | 1,719.00 | 1,158.00 | 1,454.00 | 711.00 | 1,930.00 |
| CRS_ELAPSED_TIME | 140.00 | 97.00 | 89.00 | 71.00 | 135.00 |
| SEASON | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
| ... | ... | ... | ... | ... | ... |
| ORIGIN_STATE_NM_Utah | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ORIGIN_STATE_NM_Vermont | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ORIGIN_STATE_NM_Virginia | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ORIGIN_STATE_NM_Washington | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ORIGIN_STATE_NM_Wyoming | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
197 rows × 5 columns
# Preview of the development target (ARR_DEL15).
dev_df_y.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| ARR_DEL15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
# Shape/dtype summary of the development features.
dev_df_X.info(verbose = False)
<class 'pandas.core.frame.DataFrame'> Int64Index: 187841 entries, 0 to 250151 Columns: 197 entries, CRS_DEP_TIME to ORIGIN_STATE_NM_Wyoming dtypes: float64(2), int32(1), int64(8), uint8(186) memory usage: 49.8 MB
# Shape/dtype summary of the development target.
dev_df_y.info(verbose = False)
<class 'pandas.core.frame.DataFrame'> Int64Index: 187841 entries, 0 to 250151 Columns: 1 entries, ARR_DEL15 to ARR_DEL15 dtypes: float64(1) memory usage: 2.9 MB
# Shape/dtype summary of the validation features.
val_df_X.info(verbose = False)
<class 'pandas.core.frame.DataFrame'> Int64Index: 62311 entries, 120 to 250004 Columns: 197 entries, CRS_DEP_TIME to ORIGIN_STATE_NM_Wyoming dtypes: float64(2), int32(1), int64(8), uint8(186) memory usage: 16.5 MB
# Shape/dtype summary of the validation target.
val_df_y.info(verbose = False)
<class 'pandas.core.frame.DataFrame'> Int64Index: 62311 entries, 120 to 250004 Columns: 1 entries, ARR_DEL15 to ARR_DEL15 dtypes: float64(1) memory usage: 973.6 KB
B.- Train/Test Split (Random Holdout)
Como el dataset de validación es lo suficientemente grande (62311 registros), usaremos como estrategia de validación el Random Holdout (no sería necesario hacer un k-Fold)
# IPython introspection of the random-holdout helper and its parameters.
model_selection.train_test_split??
Signature: model_selection.train_test_split( *arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None, ) Source: @validate_params( { "test_size": [ Interval(RealNotInt, 0, 1, closed="neither"), Interval(numbers.Integral, 1, None, closed="left"), None, ], "train_size": [ Interval(RealNotInt, 0, 1, closed="neither"), Interval(numbers.Integral, 1, None, closed="left"), None, ], "random_state": ["random_state"], "shuffle": ["boolean"], "stratify": ["array-like", None], }, prefer_skip_nested_validation=True, ) def train_test_split( *arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None, ): """Split arrays or matrices into random train and test subsets. Quick utility that wraps input validation, ``next(ShuffleSplit().split(X, y))``, and application to input data into a single call for splitting (and optionally subsampling) data into a one-liner. Read more in the :ref:`User Guide <cross_validation>`. Parameters ---------- *arrays : sequence of indexables with same length / shape[0] Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas dataframes. test_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If ``train_size`` is also None, it will be set to 0.25. train_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. random_state : int, RandomState instance or None, default=None Controls the shuffling applied to the data before applying the split. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. 
shuffle : bool, default=True Whether or not to shuffle the data before splitting. If shuffle=False then stratify must be None. stratify : array-like, default=None If not None, data is split in a stratified fashion, using this as the class labels. Read more in the :ref:`User Guide <stratification>`. Returns ------- splitting : list, length=2 * len(arrays) List containing train-test split of inputs. .. versionadded:: 0.16 If the input is sparse, the output will be a ``scipy.sparse.csr_matrix``. Else, output type is the same as the input type. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import train_test_split >>> X, y = np.arange(10).reshape((5, 2)), range(5) >>> X array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) >>> list(y) [0, 1, 2, 3, 4] >>> X_train, X_test, y_train, y_test = train_test_split( ... X, y, test_size=0.33, random_state=42) ... >>> X_train array([[4, 5], [0, 1], [6, 7]]) >>> y_train [2, 0, 3] >>> X_test array([[2, 3], [8, 9]]) >>> y_test [1, 4] >>> train_test_split(y, shuffle=False) [[0, 1, 2], [3, 4]] """ n_arrays = len(arrays) if n_arrays == 0: raise ValueError("At least one array required as input") arrays = indexable(*arrays) n_samples = _num_samples(arrays[0]) n_train, n_test = _validate_shuffle_split( n_samples, test_size, train_size, default_test_size=0.25 ) if shuffle is False: if stratify is not None: raise ValueError( "Stratified train/test split is not implemented for shuffle=False" ) train = np.arange(n_train) test = np.arange(n_train, n_train + n_test) else: if stratify is not None: CVClass = StratifiedShuffleSplit else: CVClass = ShuffleSplit cv = CVClass(test_size=n_test, train_size=n_train, random_state=random_state) train, test = next(cv.split(X=arrays[0], y=stratify)) return list( chain.from_iterable( (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays ) ) File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\model_selection\_split.py Type: function
# Random holdout: split the development data into train (70%) and test (30%).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    dev_df_X, # development features (X)
    dev_df_y, # development target (y)
    test_size = 0.30, # fraction of rows held out for test
    random_state = 42 # fixed seed for reproducibility
    # NOTE(review): for an imbalanced binary target, stratify=dev_df_y would
    # keep the class ratio identical in train and test — confirm before
    # enabling it, since it changes which rows land in each split.
)
Comprobamos que las distribuciones son similares
# Shape/dtype summary of the train partition.
X_train.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Int64Index: 131488 entries, 207503 to 156518 Columns: 197 entries, CRS_DEP_TIME to ORIGIN_STATE_NM_Wyoming dtypes: float64(2), int32(1), int64(8), uint8(186) memory usage: 34.9 MB
# Shape/dtype summary of the test partition.
X_test.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Int64Index: 56353 entries, 91964 to 105925 Columns: 197 entries, CRS_DEP_TIME to ORIGIN_STATE_NM_Wyoming dtypes: float64(2), int32(1), int64(8), uint8(186) memory usage: 14.9 MB
# Distribution (count/mean/std/quartiles) of the first features in train.
X_train.describe().T.head()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CRS_DEP_TIME | 131,488.00 | 1,337.92 | 526.82 | 4.00 | 917.00 | 1,300.00 | 1,745.00 | 2,359.00 |
| TAXI_OUT | 131,488.00 | -53.87 | 261.58 | -999.00 | 12.00 | 15.00 | 21.00 | 166.00 |
| CRS_ARR_TIME | 131,488.00 | 1,418.96 | 579.74 | 1.00 | 939.00 | 1,457.00 | 1,910.00 | 2,400.00 |
| CRS_ELAPSED_TIME | 131,488.00 | 195.14 | 97.35 | 47.00 | 108.00 | 167.00 | 306.00 | 595.00 |
| SEASON | 131,488.00 | 2.20 | 1.06 | 1.00 | 1.00 | 2.00 | 3.00 | 4.00 |
# Same distribution summary for test, to compare against train.
X_test.describe().T.head()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CRS_DEP_TIME | 56,353.00 | 1,335.12 | 525.68 | 3.00 | 915.00 | 1,300.00 | 1,740.00 | 2,359.00 |
| TAXI_OUT | 56,353.00 | -52.08 | 258.55 | -999.00 | 12.00 | 15.00 | 21.00 | 154.00 |
| CRS_ARR_TIME | 56,353.00 | 1,419.65 | 580.85 | 1.00 | 944.00 | 1,457.00 | 1,912.00 | 2,400.00 |
| CRS_ELAPSED_TIME | 56,353.00 | 195.47 | 97.49 | 47.00 | 108.00 | 167.00 | 306.00 | 595.00 |
| SEASON | 56,353.00 | 2.20 | 1.06 | 1.00 | 1.00 | 2.00 | 3.00 | 4.00 |
Todos los atributos del train tienen una distribución similar a los del test (medias y desviaciones parecidas)
# Target distribution in train (~21% of flights delayed, per the output below).
y_train.describe().T.head()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ARR_DEL15 | 131,488.00 | 0.21 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
# Target distribution in test — should match train's.
y_test.describe().T.head()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ARR_DEL15 | 56,353.00 | 0.21 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
La distribución del target se comporta igual en el train y en el test
C.- Model Definition
Procedimiento para entrenar el modelo:
# IPython introspection of the DecisionTreeClassifier API and hyperparameters.
DecisionTreeClassifier??
Init signature: DecisionTreeClassifier( *, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, ) Source: class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): """A decision tree classifier. Read more in the :ref:`User Guide <tree>`. Parameters ---------- criterion : {"gini", "entropy", "log_loss"}, default="gini" The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "log_loss" and "entropy" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. splitter : {"best", "random"}, default="best" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split. max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for fractions. min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. 
- If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions. min_weight_fraction_leaf : float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. max_features : int, float or {"auto", "sqrt", "log2"}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and `max(1, int(max_features * n_features_in_))` features are considered at each split. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. random_state : int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``"best"``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary <random_state>` for details. max_leaf_nodes : int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. 
If None then unlimited number of leaf nodes. min_impurity_decrease : float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 class_weight : dict, list of dict or "balanced", default=None Weights associated with classes in the form ``{class_label: weight}``. If None, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. Note that for multioutput (including multilabel) weights should be defined for each class of every column in its own dict. For example, for four-class multilabel classification weights should be [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of [{1:1}, {2:5}, {3:1}, {4:1}]. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See :ref:`minimal_cost_complexity_pruning` for details. .. 
versionadded:: 0.22 Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray The classes labels (single output problem), or a list of arrays of class labels (multi-output problem). feature_importances_ : ndarray of shape (n_features,) The impurity-based feature importances. The higher, the more important the feature. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance [4]_. Warning: impurity-based feature importances can be misleading for high cardinality features (many unique values). See :func:`sklearn.inspection.permutation_importance` as an alternative. max_features_ : int The inferred value of max_features. n_classes_ : int or list of int The number of classes (for single output problems), or a list containing the number of classes for each output (for multi-output problems). n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 n_outputs_ : int The number of outputs when ``fit`` is performed. tree_ : Tree instance The underlying Tree object. Please refer to ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. See Also -------- DecisionTreeRegressor : A decision tree regressor. Notes ----- The default values for the parameters controlling the size of the trees (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and unpruned trees which can potentially be very large on some data sets. To reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. 
The :meth:`predict` method operates using the :func:`numpy.argmax` function on the outputs of :meth:`predict_proba`. This means that in case the highest predicted probabilities are tied, the classifier will predict the tied class with the lowest index in :term:`classes_`. References ---------- .. [1] https://en.wikipedia.org/wiki/Decision_tree_learning .. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification and Regression Trees", Wadsworth, Belmont, CA, 1984. .. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical Learning", Springer, 2009. .. [4] L. Breiman, and A. Cutler, "Random Forests", https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm Examples -------- >>> from sklearn.datasets import load_iris >>> from sklearn.model_selection import cross_val_score >>> from sklearn.tree import DecisionTreeClassifier >>> clf = DecisionTreeClassifier(random_state=0) >>> iris = load_iris() >>> cross_val_score(clf, iris.data, iris.target, cv=10) ... # doctest: +SKIP ... array([ 1. , 0.93..., 0.86..., 0.93..., 0.93..., 0.93..., 0.93..., 1. , 0.93..., 1. 
]) """ _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], "class_weight": [dict, list, StrOptions({"balanced"}), None], } def __init__( self, *, criterion="gini", splitter="best", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, ): super().__init__( criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, ) @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree classifier from the training set (X, y). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csc_matrix``. y : array-like of shape (n_samples,) or (n_samples, n_outputs) The target values (class labels) as integers or strings. sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. Splits are also ignored if they would result in any single class carrying a negative weight in either child node. check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you're doing. Returns ------- self : DecisionTreeClassifier Fitted estimator. 
""" super()._fit( X, y, sample_weight=sample_weight, check_input=check_input, ) return self def predict_proba(self, X, check_input=True): """Predict class probabilities of the input samples X. The predicted class probability is the fraction of samples of the same class in a leaf. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you're doing. Returns ------- proba : ndarray of shape (n_samples, n_classes) or list of n_outputs \ such arrays if n_outputs > 1 The class probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. """ check_is_fitted(self) X = self._validate_X_predict(X, check_input) proba = self.tree_.predict(X) if self.n_outputs_ == 1: proba = proba[:, : self.n_classes_] normalizer = proba.sum(axis=1)[:, np.newaxis] normalizer[normalizer == 0.0] = 1.0 proba /= normalizer return proba else: all_proba = [] for k in range(self.n_outputs_): proba_k = proba[:, k, : self.n_classes_[k]] normalizer = proba_k.sum(axis=1)[:, np.newaxis] normalizer[normalizer == 0.0] = 1.0 proba_k /= normalizer all_proba.append(proba_k) return all_proba def predict_log_proba(self, X): """Predict class log-probabilities of the input samples X. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. Returns ------- proba : ndarray of shape (n_samples, n_classes) or list of n_outputs \ such arrays if n_outputs > 1 The class log-probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. 
""" proba = self.predict_proba(X) if self.n_outputs_ == 1: return np.log(proba) else: for k in range(self.n_outputs_): proba[k] = np.log(proba[k]) return proba def _more_tags(self): # XXX: nan is only support for dense arrays, but we set this for common test to # pass, specifically: check_estimators_nan_inf allow_nan = self.splitter == "best" and self.criterion in { "gini", "log_loss", "entropy", } return {"multilabel": True, "allow_nan": allow_nan} File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\tree\_classes.py Type: ABCMeta Subclasses: ExtraTreeClassifier
# Instantiate the model
dt = DecisionTreeClassifier(
    criterion = 'gini', # impurity criterion for the splits; "gini" and "entropy" both work well
    splitter = 'best', # strategy to choose the split at each node: "best", "random", ... usually "best"
    max_depth = 4, # maximum depth of the tree; key knob to control overfitting
    min_samples_split = 500, # minimum number of samples a node must hold before it can be split
    # min_samples_leaf=1, # important in regression (helps smooth the model)
    # min_weight_fraction_leaf=0.0,
    # max_features=None, # maximum number of features considered when choosing each split
    random_state=42, # controls the estimator's randomness; fixed for reproducibility
    # max_leaf_nodes=None,
    # min_impurity_decrease=0.0,
    # min_impurity_split=None, # NOTE(review): removed in modern scikit-learn; keep commented out
    # class_weight=None,
    # presort=False, # NOTE(review): removed in modern scikit-learn; keep commented out
    )
dt.fit??
Signature: dt.fit(X, y, sample_weight=None, check_input=True) Source: @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree classifier from the training set (X, y). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csc_matrix``. y : array-like of shape (n_samples,) or (n_samples, n_outputs) The target values (class labels) as integers or strings. sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. Splits are also ignored if they would result in any single class carrying a negative weight in either child node. check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you're doing. Returns ------- self : DecisionTreeClassifier Fitted estimator. """ super()._fit( X, y, sample_weight=sample_weight, check_input=check_input, ) return self File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\tree\_classes.py Type: method
Entrenamos el modelo dándole unas reglas a partir de los datos
%%time
dt.fit(
    X = X_train, # training features (no target column)
    y = y_train, # training target
    # sample_weight=None, # per-sample weights
    # check_input=True, # rarely changed
    )
Wall time: 652 ms
DecisionTreeClassifier(max_depth=4, min_samples_split=500, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(max_depth=4, min_samples_split=500, random_state=42)
Exportamos con formato DOT y lo visualizamos
export_graphviz??
Signature: export_graphviz( decision_tree, out_file=None, *, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, leaves_parallel=False, impurity=True, node_ids=False, proportion=False, rotate=False, rounded=False, special_characters=False, precision=3, fontname='helvetica', ) Source: @validate_params( { "decision_tree": "no_validation", "out_file": [str, None, HasMethods("write")], "max_depth": [Interval(Integral, 0, None, closed="left"), None], "feature_names": ["array-like", None], "class_names": ["array-like", "boolean", None], "label": [StrOptions({"all", "root", "none"})], "filled": ["boolean"], "leaves_parallel": ["boolean"], "impurity": ["boolean"], "node_ids": ["boolean"], "proportion": ["boolean"], "rotate": ["boolean"], "rounded": ["boolean"], "special_characters": ["boolean"], "precision": [Interval(Integral, 0, None, closed="left"), None], "fontname": [str], }, prefer_skip_nested_validation=True, ) def export_graphviz( decision_tree, out_file=None, *, max_depth=None, feature_names=None, class_names=None, label="all", filled=False, leaves_parallel=False, impurity=True, node_ids=False, proportion=False, rotate=False, rounded=False, special_characters=False, precision=3, fontname="helvetica", ): """Export a decision tree in DOT format. This function generates a GraphViz representation of the decision tree, which is then written into `out_file`. Once exported, graphical renderings can be generated using, for example:: $ dot -Tps tree.dot -o tree.ps (PostScript format) $ dot -Tpng tree.dot -o tree.png (PNG format) The sample counts that are shown are weighted with any sample_weights that might be present. Read more in the :ref:`User Guide <tree>`. Parameters ---------- decision_tree : object The decision tree estimator to be exported to GraphViz. out_file : object or str, default=None Handle or name of the output file. If ``None``, the result is returned as a string. .. 
versionchanged:: 0.20 Default of out_file changed from "tree.dot" to None. max_depth : int, default=None The maximum depth of the representation. If None, the tree is fully generated. feature_names : array-like of shape (n_features,), default=None An array containing the feature names. If None, generic names will be used ("x[0]", "x[1]", ...). class_names : array-like of shape (n_classes,) or bool, default=None Names of each of the target classes in ascending numerical order. Only relevant for classification and not supported for multi-output. If ``True``, shows a symbolic representation of the class name. label : {'all', 'root', 'none'}, default='all' Whether to show informative labels for impurity, etc. Options include 'all' to show at every node, 'root' to show only at the top root node, or 'none' to not show at any node. filled : bool, default=False When set to ``True``, paint nodes to indicate majority class for classification, extremity of values for regression, or purity of node for multi-output. leaves_parallel : bool, default=False When set to ``True``, draw all leaf nodes at the bottom of the tree. impurity : bool, default=True When set to ``True``, show the impurity at each node. node_ids : bool, default=False When set to ``True``, show the ID number on each node. proportion : bool, default=False When set to ``True``, change the display of 'values' and/or 'samples' to be proportions and percentages respectively. rotate : bool, default=False When set to ``True``, orient tree left to right rather than top-down. rounded : bool, default=False When set to ``True``, draw node boxes with rounded corners. special_characters : bool, default=False When set to ``False``, ignore special characters for PostScript compatibility. precision : int, default=3 Number of digits of precision for floating point in the values of impurity, threshold and value attributes of each node. fontname : str, default='helvetica' Name of font used to render text. 
Returns ------- dot_data : str String representation of the input tree in GraphViz dot format. Only returned if ``out_file`` is None. .. versionadded:: 0.18 Examples -------- >>> from sklearn.datasets import load_iris >>> from sklearn import tree >>> clf = tree.DecisionTreeClassifier() >>> iris = load_iris() >>> clf = clf.fit(iris.data, iris.target) >>> tree.export_graphviz(clf) 'digraph Tree {... """ if feature_names is not None: feature_names = check_array( feature_names, ensure_2d=False, dtype=None, ensure_min_samples=0 ) if class_names is not None and not isinstance(class_names, bool): class_names = check_array( class_names, ensure_2d=False, dtype=None, ensure_min_samples=0 ) check_is_fitted(decision_tree) own_file = False return_string = False try: if isinstance(out_file, str): out_file = open(out_file, "w", encoding="utf-8") own_file = True if out_file is None: return_string = True out_file = StringIO() exporter = _DOTTreeExporter( out_file=out_file, max_depth=max_depth, feature_names=feature_names, class_names=class_names, label=label, filled=filled, leaves_parallel=leaves_parallel, impurity=impurity, node_ids=node_ids, proportion=proportion, rotate=rotate, rounded=rounded, special_characters=special_characters, precision=precision, fontname=fontname, ) exporter.export(decision_tree) if return_string: return exporter.out_file.getvalue() finally: if own_file: out_file.close() File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\tree\_export.py Type: function
# Export the tree in DOT (text) format using the graphviz library (used to render diagrams)
dot_data = export_graphviz(
    decision_tree = dt, # fitted DecisionTree we want to export
    out_file = None, # output file name (None --> the DOT text is returned as a string)
    max_depth = None, # maximum depth of the representation (None renders the full tree)
    feature_names = X_test.columns, # feature names (the X_test columns)
    class_names = ['No Delay', 'Delay'], # target class names in ascending order (0 --> No Delay, 1 --> Delay)
    # label='all',
    filled = True, # True --> in classification, color each node by its majority class
    # leaves_parallel=False,
    impurity = True, # show the impurity of each node
    # node_ids=False, # show each node's ID
    proportion = True, # show samples as proportions (False shows absolute counts)
    rotate = True, # orient the tree left-to-right instead of top-down
    rounded = True, # draw node boxes with rounded corners
    # special_characters=False,
    precision = 3, # digits of precision for impurity, threshold and value at each node
    fontname = 'helvetica'
    )
# Render the tree. graphviz produces its own image, so no matplotlib figure is
# involved — the previous plt.figure(figsize=(8,8)) only emitted an empty
# "<Figure size 800x800 with 0 Axes>" in the cell output and has been removed.
graph_dt = graphviz.Source(dot_data, format = 'png') # turn the DOT text into a renderable graph
graph_dt
<Figure size 800x800 with 0 Axes>
La construcción del árbol de decisión se basa en encontrar una métrica capaz de medir el desorden en un conjunto de datos en relación al valor del atributo, y que se llama entropía (H). Por lo tanto, la entropía mide la cantidad de desorden o incertidumbre en el sistema.
Existen varias fórmulas para el cálculo de la entropía. Una de ellas:
Image('pictures/entropia.jpg')
También se suele utilizar para medir la entropía el índice Gini.
Image('pictures/gini.jpg')
donde pi es la probabilidad de cada una de las clases
# Gini index curve for a two-class problem
p = np.linspace(0, 1, 100)           # class-1 probability from 0 to 1
gini = 1 - p ** 2 - (1 - p) ** 2     # Gini impurity at each probability
fig, ax = plt.subplots()
ax.plot(p, gini)
ax.grid(color = "#EEEEEE", zorder = 0)
ax.set_xlabel("p")
ax.set_ylabel("Índice Gini")
plt.show()
Esto es para dos clases. El índice Gini aumenta a medida que aumenta el número de clases. Por ejemplo, si tenemos 4 clases, el valor máximo de Gini sería:
# Maximum Gini for 4 equally likely classes (p_i = 0.25 each)
1 - 0.25**2 - 0.25**2 - 0.25**2 - 0.25**2
0.75
En cada nodo o iteración nos encontramos con la siguiente información:
Y así sucesivamente hasta un total de 4 iteraciones, que es la profundidad del árbol que le hemos dado.
A la última capa se le llama nodo terminal. El número de nodos terminales será: 2^profundidad. Por lo tanto, el número de probabilidades diferentes que obtenemos será:
# 16 terminal nodes --> 16 different predicted probabilities (2 ** depth)
2**4
16
El cálculo del Gini se ha hecho de la siguiente manera:
Gini = 1 - p1^2 - p2^2
# First split: Gini impurity from the two class proportions (0.785 / 0.215)
Gini_1 = 1 - sum(p ** 2 for p in (0.785, 0.215))
Gini_1
0.33754999999999996
# Second split, CRS_ARR_TIME branch: Gini from proportions (0.807 / 0.193)
Gini_2A = 1 - sum(p ** 2 for p in (0.807, 0.193))
Gini_2A
0.31150199999999995
# Second split, TAXI_OUT branch: Gini from proportions (0.361 / 0.639)
Gini_2B = 1 - sum(p ** 2 for p in (0.361, 0.639))
Gini_2B
0.461358
Gini vs Entropía:
El fin del DecisionTree es la reducción de la entropía. A la reducción de la entropía se la conoce como Ganancia de Información (Information Gain). La ganancia de información IG(Q) para un corte basado en el atributo Q vendría dada por la expresión:
Image('pictures/information_gain.jpg')
siendo:
En cada corte, el algoritmo se quedará con el atributo con el que se obtenga una IG más alta, o lo que es lo mismo, con el que se obtenga una entropía más baja
Se puede apreciar que, para el algoritmo, la variable más relevante es TAXI_OUT.
Recordamos el análisis de estas variables relevantes con el target:
# Rows with a real TAXI_OUT value (remember nulls were imputed with the extreme value -999)
df_taxi_out_NotNull = df[(df['TAXI_OUT'] >= 0)]
df_taxi_out_NotNull.head()
| CRS_DEP_TIME | TAXI_OUT | CRS_ARR_TIME | ARR_DEL15 | CRS_ELAPSED_TIME | SEASON | YEAR | MONTH | WEEK | WEEKDAY | ... | ORIGIN_STATE_NM_Puerto Rico | ORIGIN_STATE_NM_South Carolina | ORIGIN_STATE_NM_Tennessee | ORIGIN_STATE_NM_Texas | ORIGIN_STATE_NM_U.S. Virgin Islands | ORIGIN_STATE_NM_Utah | ORIGIN_STATE_NM_Vermont | ORIGIN_STATE_NM_Virginia | ORIGIN_STATE_NM_Washington | ORIGIN_STATE_NM_Wyoming | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1359 | 27.00 | 1719 | 0.00 | 140.00 | 1 | 2018 | 2 | 5 | 3 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1021 | 11.00 | 1158 | 0.00 | 97.00 | 1 | 2018 | 2 | 5 | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1325 | 31.00 | 1454 | 0.00 | 89.00 | 1 | 2018 | 2 | 6 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 600 | 19.00 | 711 | 0.00 | 71.00 | 1 | 2018 | 2 | 6 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1715 | 11.00 | 1930 | 1.00 | 135.00 | 1 | 2018 | 2 | 6 | 6 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 198 columns
# TAXI_OUT vs. the target, on the rows with real TAXI_OUT values
sns.boxplot(data = df_taxi_out_NotNull, x = target, y = 'TAXI_OUT' );
# CRS_DEP_TIME vs. the target, on the full dataframe
sns.boxplot(data = df, x = target, y = 'CRS_DEP_TIME' );
Otra forma de ver las variables más relevantes es mediante el feature_importances
# Most relevant features
dt.feature_importances_ # per-feature importance weights learned by the fitted tree (normalized)
array([0.08728815, 0.70279531, 0.15668835, 0.00095434, 0. ,
0. , 0. , 0.0188241 , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0.00952253, 0. , 0. , 0.02392722,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. ])
# Wrap the importances array in a pandas Series indexed by feature name,
# then sort from most to least important
importances = pd.Series(dt.feature_importances_, index = X_train.columns)
dt_feature_importance = importances.sort_values(ascending = False)
dt_feature_importance
TAXI_OUT 0.70
CRS_ARR_TIME 0.16
CRS_DEP_TIME 0.09
OP_CARRIER_AIRLINE_ID_20409.0 0.02
WEEK 0.02
...
ORIGIN_PHX 0.00
ORIGIN_PIT 0.00
ORIGIN_PSE 0.00
ORIGIN_PSP 0.00
ORIGIN_STATE_NM_Wyoming 0.00
Length: 197, dtype: float64
# Plot the most important features
plt.figure(figsize=(15,5))
dt_feature_importance.head(15).plot(kind = 'bar', ylabel = 'Coeficientes de ponderación') # top 15 features
plt.title('Feature Importances');
Podemos ver que las variables más importantes son el TAXI_OUT y el CRS_ARR_TIME, como ya habíamos visto en la representación del diagrama de árbol.
En el primer entrenamiento del modelo suelen utilizarse todas las variables y el modelo final se suele entrenar con las 10-15 variables más importantes
D.- Model Evaluation
El modelo ha generado una serie de reglas en base al X_train que le he pasado. Ahora le paso el X_test para que cada uno de sus registros recorran el árbol que se ha generado y obtener una predicción de cada vuelo (de cada registro). Finalmente se comparan los valores de predicción del X_test con el valor correspondiente del y_test (valores del target).
X_test.info(verbose = False)
<class 'pandas.core.frame.DataFrame'> Int64Index: 56353 entries, 91964 to 105925 Columns: 197 entries, CRS_DEP_TIME to ORIGIN_STATE_NM_Wyoming dtypes: float64(2), int32(1), int64(8), uint8(186) memory usage: 14.9 MB
dt.predict??
Signature: dt.predict(X, check_input=True) Source: def predict(self, X, check_input=True): """Predict class or regression value for X. For a classification model, the predicted class for each sample in X is returned. For a regression model, the predicted value based on X is returned. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you're doing. Returns ------- y : array-like of shape (n_samples,) or (n_samples, n_outputs) The predicted classes, or the predict values. """ check_is_fitted(self) X = self._validate_X_predict(X, check_input) proba = self.tree_.predict(X) n_samples = X.shape[0] # Classification if is_classifier(self): if self.n_outputs_ == 1: return self.classes_.take(np.argmax(proba, axis=1), axis=0) else: class_type = self.classes_[0].dtype predictions = np.zeros((n_samples, self.n_outputs_), dtype=class_type) for k in range(self.n_outputs_): predictions[:, k] = self.classes_[k].take( np.argmax(proba[:, k], axis=1), axis=0 ) return predictions # Regression else: if self.n_outputs_ == 1: return proba[:, 0] else: return proba[:, :, 0] File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\tree\_classes.py Type: method
# Predictions for X_test, packed into a DataFrame aligned with y_test's index
predicted_labels = dt.predict(X = X_test)          # prediction array for X_test
y_test_prediction = pd.DataFrame(
    {'DelayPrediction': predicted_labels},          # named column
    index = y_test.index,                           # same index as y_test
)
y_test_prediction.head()
| DelayPrediction | |
|---|---|
| 91964 | 1.00 |
| 205250 | 0.00 |
| 156190 | 0.00 |
| 117192 | 0.00 |
| 33706 | 0.00 |
Podemos unir en un mismo dataframe y_test e y_test_prediction, ya que comparten el mismo índice
# Join true target and prediction on their shared index
results_df = y_test.join(y_test_prediction, how= 'inner')
results_df
| ARR_DEL15 | DelayPrediction | |
|---|---|---|
| 91964 | 1.00 | 1.00 |
| 205250 | 1.00 | 0.00 |
| 156190 | 0.00 | 0.00 |
| 117192 | 0.00 | 0.00 |
| 33706 | 0.00 | 0.00 |
| ... | ... | ... |
| 53676 | 0.00 | 0.00 |
| 23519 | 0.00 | 0.00 |
| 22023 | 0.00 | 0.00 |
| 14082 | 0.00 | 0.00 |
| 105925 | 0.00 | 0.00 |
56353 rows × 2 columns
Podemos añadir al dataframe una columna que nos indique si el modelo ha hecho la predicción bien (1) o no (0).
# Flag each row: 1 when the prediction matches the true label, 0 otherwise
results_df['Success'] = results_df[target].eq(results_df['DelayPrediction']).astype(int)
results_df.sample(20)
| ARR_DEL15 | DelayPrediction | Success | |
|---|---|---|---|
| 138032 | 0.00 | 0.00 | 1 |
| 79697 | 1.00 | 0.00 | 0 |
| 242727 | 0.00 | 0.00 | 1 |
| 121409 | 0.00 | 0.00 | 1 |
| 60318 | 1.00 | 0.00 | 0 |
| 191881 | 0.00 | 0.00 | 1 |
| 189967 | 0.00 | 0.00 | 1 |
| 180416 | 0.00 | 0.00 | 1 |
| 58620 | 0.00 | 0.00 | 1 |
| 151020 | 0.00 | 0.00 | 1 |
| 166166 | 0.00 | 0.00 | 1 |
| 129765 | 0.00 | 0.00 | 1 |
| 205113 | 0.00 | 0.00 | 1 |
| 175897 | 0.00 | 0.00 | 1 |
| 51117 | 0.00 | 0.00 | 1 |
| 244037 | 0.00 | 0.00 | 1 |
| 58724 | 0.00 | 0.00 | 1 |
| 37483 | 0.00 | 0.00 | 1 |
| 176953 | 0.00 | 0.00 | 1 |
| 161107 | 1.00 | 0.00 | 0 |
Determinamos el porcentaje de acierto del modelo
# Number of flights in the test set
numero_vuelos = results_df['Success'].count()
numero_vuelos
56353
# Number of correct predictions (sum of the 0/1 Success flags)
numero_aciertos = results_df['Success'].sum()
numero_aciertos
45147
# Mean of the Success flag = fraction of correct predictions --> accuracy
probabilidad_acierto = results_df['Success'].mean()
probabilidad_acierto
0.8011463453587209
# Same message, f-string instead of str.format (identical output)
print(f'El modelo acierta {numero_aciertos} registros de un total de {numero_vuelos} vuelos, por lo tanto el Accuracy es {probabilidad_acierto}.')
El modelo acierta 45147 registros de un total de 56353 vuelos, por lo tanto el Accuracy es 0.8011463453587209.
Image('pictures/confusion_matrix.jpg')
Lo ideal en un modelo sería que se equivocase la mitad en FN y la otra mitad en FP. Aunque hay casos de uso donde es preferible que se equivoque menos en uno que en otro. Por ejemplo en un diagnóstico médico es preferible que se equivoque menos en el FN (si estoy enfermo y predice que no lo estoy, es más costoso) que en el FP (si no estoy enfermo y predice que sí lo estoy, es menos costoso).
Para determinar la Confusion Matrix vamos a utilizar la función crosstab() de Pandas, que nos creará una tabla de conteo a partir de los datos del target del X_test y de la predicción del X_test.
pd.crosstab?
Signature: pd.crosstab( index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins: 'bool' = False, margins_name: 'str' = 'All', dropna: 'bool' = True, normalize=False, ) -> 'DataFrame' Docstring: Compute a simple cross tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an aggregation function are passed. Parameters ---------- index : array-like, Series, or list of arrays/Series Values to group by in the rows. columns : array-like, Series, or list of arrays/Series Values to group by in the columns. values : array-like, optional Array of values to aggregate according to the factors. Requires `aggfunc` be specified. rownames : sequence, default None If passed, must match number of row arrays passed. colnames : sequence, default None If passed, must match number of column arrays passed. aggfunc : function, optional If specified, requires `values` be specified as well. margins : bool, default False Add row/column margins (subtotals). margins_name : str, default 'All' Name of the row/column that will contain the totals when margins is True. dropna : bool, default True Do not include columns whose entries are all NaN. normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False Normalize by dividing all values by the sum of values. - If passed 'all' or `True`, will normalize over all values. - If passed 'index' will normalize over each row. - If passed 'columns' will normalize over each column. - If margins is `True`, will also normalize margin values. Returns ------- DataFrame Cross tabulation of the data. See Also -------- DataFrame.pivot : Reshape data based on column values. pivot_table : Create a pivot table as a DataFrame. Notes ----- Any Series passed will have their name attributes used unless row or column names for the cross-tabulation are specified. 
Any input passed containing Categorical data will have **all** of its categories included in the cross-tabulation, even if the actual data does not contain any instances of a particular category. In the event that there aren't overlapping indexes an empty DataFrame will be returned. Reference :ref:`the user guide <reshaping.crosstabulations>` for more examples. Examples -------- >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar", ... "bar", "bar", "foo", "foo", "foo"], dtype=object) >>> b = np.array(["one", "one", "one", "two", "one", "one", ... "one", "two", "two", "two", "one"], dtype=object) >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", ... "shiny", "dull", "shiny", "shiny", "shiny"], ... dtype=object) >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) b one two c dull shiny dull shiny a bar 1 2 1 0 foo 2 2 1 2 Here 'c' and 'f' are not represented in the data and will not be shown in the output because dropna is True by default. Set dropna=False to preserve categories with no data. >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) >>> pd.crosstab(foo, bar) col_0 d e row_0 a 1 0 b 0 1 >>> pd.crosstab(foo, bar, dropna=False) col_0 d e f row_0 a 1 0 0 b 0 1 0 c 0 0 0 File: c:\users\jagui\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py Type: function
# Confusion matrix via pandas crosstab: rows = true label, columns = predicted label
confusion_matrix = pd.crosstab(
    results_df[target], # true labels of the test set (a Series --> 1 dimension)
    results_df['DelayPrediction'] # predicted labels for X_test (a Series --> 1 dimension)
)
confusion_matrix
| DelayPrediction | 0.00 | 1.00 |
|---|---|---|
| ARR_DEL15 | ||
| 0.00 | 43705 | 559 |
| 1.00 | 10647 | 1442 |
# Unpack the 2x2 confusion matrix (rows = true class 0/1, columns = predicted 0/1):
# row-major order gives TN, FP, FN, TP
TN, FP, FN, TP = confusion_matrix.values.ravel()
print(f'TP = {TP}, FN = {FN}, FP = {FP}, TN = {TN}')
TP = 1442, FN = 10647, FP = 559, TN = 43705
Una vez calculada la matriz de confusión, podemos determinar diferentes métricas de validación con ella:
Accuracy o Precisión
# Accuracy: correct predictions over all predictions
correct = TP + TN
dt_acc = correct / (correct + FP + FN)
dt_acc
0.8011463453587209
Otra forma de obtener la metrica es utilizando la librería metrics de scikit-learn (calcula la Confusion Matrix internamente)
metrics??
Type: module String form: <module 'sklearn.metrics' from 'c:\\Users\\jagui\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\__init__.py'> File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\metrics\__init__.py Source: """ The :mod:`sklearn.metrics` module includes score functions, performance metrics and pairwise metrics and distance computations. """ from . import cluster from ._classification import ( accuracy_score, balanced_accuracy_score, brier_score_loss, class_likelihood_ratios, classification_report, cohen_kappa_score, confusion_matrix, f1_score, fbeta_score, hamming_loss, hinge_loss, jaccard_score, log_loss, matthews_corrcoef, multilabel_confusion_matrix, precision_recall_fscore_support, precision_score, recall_score, zero_one_loss, ) from ._dist_metrics import DistanceMetric from ._plot.confusion_matrix import ConfusionMatrixDisplay from ._plot.det_curve import DetCurveDisplay from ._plot.precision_recall_curve import PrecisionRecallDisplay from ._plot.regression import PredictionErrorDisplay from ._plot.roc_curve import RocCurveDisplay from ._ranking import ( auc, average_precision_score, coverage_error, dcg_score, det_curve, label_ranking_average_precision_score, label_ranking_loss, ndcg_score, precision_recall_curve, roc_auc_score, roc_curve, top_k_accuracy_score, ) from ._regression import ( d2_absolute_error_score, d2_pinball_score, d2_tweedie_score, explained_variance_score, max_error, mean_absolute_error, mean_absolute_percentage_error, mean_gamma_deviance, mean_pinball_loss, mean_poisson_deviance, mean_squared_error, mean_squared_log_error, mean_tweedie_deviance, median_absolute_error, r2_score, ) from ._scorer import check_scoring, get_scorer, get_scorer_names, make_scorer from .cluster import ( adjusted_mutual_info_score, adjusted_rand_score, calinski_harabasz_score, completeness_score, consensus_score, davies_bouldin_score, fowlkes_mallows_score, homogeneity_completeness_v_measure, homogeneity_score, mutual_info_score, 
normalized_mutual_info_score, pair_confusion_matrix, rand_score, silhouette_samples, silhouette_score, v_measure_score, ) from .pairwise import ( euclidean_distances, nan_euclidean_distances, pairwise_distances, pairwise_distances_argmin, pairwise_distances_argmin_min, pairwise_distances_chunked, pairwise_kernels, ) __all__ = [ "accuracy_score", "adjusted_mutual_info_score", "adjusted_rand_score", "auc", "average_precision_score", "balanced_accuracy_score", "calinski_harabasz_score", "check_scoring", "class_likelihood_ratios", "classification_report", "cluster", "cohen_kappa_score", "completeness_score", "ConfusionMatrixDisplay", "confusion_matrix", "consensus_score", "coverage_error", "d2_tweedie_score", "d2_absolute_error_score", "d2_pinball_score", "dcg_score", "davies_bouldin_score", "DetCurveDisplay", "det_curve", "DistanceMetric", "euclidean_distances", "explained_variance_score", "f1_score", "fbeta_score", "fowlkes_mallows_score", "get_scorer", "hamming_loss", "hinge_loss", "homogeneity_completeness_v_measure", "homogeneity_score", "jaccard_score", "label_ranking_average_precision_score", "label_ranking_loss", "log_loss", "make_scorer", "nan_euclidean_distances", "matthews_corrcoef", "max_error", "mean_absolute_error", "mean_squared_error", "mean_squared_log_error", "mean_pinball_loss", "mean_poisson_deviance", "mean_gamma_deviance", "mean_tweedie_deviance", "median_absolute_error", "mean_absolute_percentage_error", "multilabel_confusion_matrix", "mutual_info_score", "ndcg_score", "normalized_mutual_info_score", "pair_confusion_matrix", "pairwise_distances", "pairwise_distances_argmin", "pairwise_distances_argmin_min", "pairwise_distances_chunked", "pairwise_kernels", "PrecisionRecallDisplay", "precision_recall_curve", "precision_recall_fscore_support", "precision_score", "PredictionErrorDisplay", "r2_score", "rand_score", "recall_score", "RocCurveDisplay", "roc_auc_score", "roc_curve", "get_scorer_names", "silhouette_samples", "silhouette_score", 
"top_k_accuracy_score", "v_measure_score", "zero_one_loss", "brier_score_loss", ]
# Accuracy: fraction of test records whose class was predicted correctly
dt_acc = metrics.accuracy_score(y_test, y_test_prediction)
dt_acc
0.8011463453587209
Otra forma de obtener la métrica es utilizando el propio modelo (el método score llama internamente a accuracy_score)
DecisionTreeClassifier.score??
Signature: DecisionTreeClassifier.score(self, X, y, sample_weight=None) Source: def score(self, X, y, sample_weight=None): """ Return the mean accuracy on the given test data and labels. In multi-label classification, this is the subset accuracy which is a harsh metric since you require for each sample that each label set be correctly predicted. Parameters ---------- X : array-like of shape (n_samples, n_features) Test samples. y : array-like of shape (n_samples,) or (n_samples, n_outputs) True labels for `X`. sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns ------- score : float Mean accuracy of ``self.predict(X)`` w.r.t. `y`. """ from .metrics import accuracy_score return accuracy_score(y, self.predict(X), sample_weight=sample_weight) File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\base.py Type: function
# Same accuracy straight from the estimator; score() calls accuracy_score internally
dt_acc = dt.score(X_test, y_test)
dt_acc
0.8011463453587209
Normalmente usaremos esta forma, el score del modelo, por comodidad; pero es bueno ser capaces de aplicar los distintos métodos para obtener la puntuación y analizar otros casos (por ejemplo, qué tal han validado los días según el mes).
Para saber si el Accuracy es bueno o malo, debemos compararlo con la media del target
df[target].mean()
0.21040007675333397
El 79% de los vuelos no se retrasan --> tasa de acierto base (baseline) del 79%
El modelo predice bien en el 80% de las veces. Este dato puede parecer bueno a simple vista, pero si lo comparamos con la media del target, no es tan bueno.
El modelo más simple podría hacer una predicción de que todos los vuelos no se retrasan y obtendría un 79% de aciertos. En cambio nuestro modelo ha obtenido muy poco más, un 80%. Por lo tanto es un modelo muy malo.
F1-Score
Recall y Precision normalmente van en direcciones opuestas. Es difícil conseguir que los dos sean altos. El F1, por la forma en que pondera (media armónica), penaliza mucho si uno es muy bajo y el otro muy alto.
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
print('Precision = {}, Recall = {}'.format(Precision, Recall))
Precision = 0.7206396801599201, Recall = 0.11928199189345687
Tenemos un alcance muy bajo. Es un problema porque va a penalizar mucho en el f1_score
# F1 is the harmonic mean of Precision and Recall.
# 2*P*R/(P+R) is algebraically identical to 2 / (1/P + 1/R), but the latter
# raises ZeroDivisionError as soon as one of the two metrics is exactly 0.
# The guard covers the fully degenerate case P = R = 0, where F1 is 0 by convention.
f1_score = 2 * Precision * Recall / (Precision + Recall) if (Precision + Recall) > 0 else 0.0
f1_score
0.20468417317246274
Otra forma de obtener el F1_Score:
metrics.f1_score(y_test, y_test_prediction)
0.20468417317246274
Podemos observar que al tener un Recall tan bajo, ha penalizado mucho en la métrica, obteniendo tan sólo un 0.205
Threshold = 0.5:
Si muevo el threshold:
El problema del threshold lo soluciono con la Curva ROC
ROC Curve y AUC (Area Under the Curve)
Image('pictures/curva_roc_auc.jpg')
La curva ROC nos ayudará a elegir el mejor umbral (threshold). La curva empieza en el (0,0) y termina en el (1,1), que corresponde a un threshold de 0; el threshold va aumentando a medida que nos acercamos al (0,0).
Si tomamos por ejemplo el punto (0.5, 0.85), significa que el modelo predice correctamente el 85% del total de valores positivos, con una tasa de falsos positivos del 50%.
Cada curva ROC que vemos en la figura corresponde a un clasificador diferente. Es una buena forma de visualizar y comparar los diferentes clasificadores que se vayan ajustando para el mismo problema de clasificación.\ Un clasificador random sería, por ejemplo, lanzar una moneda (la probabilidad de obtener cara es igual a la probabilidad de obtener cruz) --> FPR = TPR
El área bajo la curva (AUC) se puede utilizar como resumen de la habilidad del modelo. Un clasificador sin habilidad en cada umbral es aquel que no puede discriminar entre las clases y predeciría una clase aleatoria o constante en todos los casos (clasificador random). El AUC de un modelo sin habilidad es de 0.5. Un modelo con habilidad perfecta tiene un AUC de 1.
TPR = 1 - FNR\ FPR = 1 - TNR
TPR y FPR, se ajustan a través del threshold
¿Qué umbral elegimos?
Debemos tener en cuenta:
Validación del modelo de forma probabilista (score)
En vez de predecir la clase (utilizando el método predict) de cada registro, vamos a predecir las probabilidades de la clase (utilizando el método predict_proba) para cada registro.
# Array of per-record class probabilities for X_test: column 0 = P(class 0), column 1 = P(class 1)
print(dt.predict_proba(X = X_test))
[[0.2 0.8 ] [0.90293986 0.09706014] [0.79220779 0.20779221] ... [0.82749644 0.17250356] [0.82749644 0.17250356] [0.82749644 0.17250356]]
# Keep only the second column of the array --> probability of class 1 (flight delayed)
print(dt.predict_proba(X = X_test)[:,1])
[0.8 0.09706014 0.20779221 ... 0.17250356 0.17250356 0.17250356]
# One-column DataFrame with the delay probability of every record in the test split
delay_probabilities = dt.predict_proba(X = X_test)[:, 1]  # P(class 1) per X_test row
y_test_score = pd.DataFrame(
    {'DelayScore': delay_probabilities},  # scoring column
    index = y_test.index,                 # same index as the test target so later joins align
)
y_test_score.head()
| DelayScore | |
|---|---|
| 91964 | 0.80 |
| 205250 | 0.10 |
| 156190 | 0.21 |
| 117192 | 0.17 |
| 33706 | 0.21 |
# Index-aligned join: append the DelayScore column to the per-record results table
results_df = results_df.join(y_test_score)
results_df
| ARR_DEL15 | DelayPrediction | Success | DelayScore | |
|---|---|---|---|---|
| 91964 | 1.00 | 1.00 | 1 | 0.80 |
| 205250 | 1.00 | 0.00 | 0 | 0.10 |
| 156190 | 0.00 | 0.00 | 1 | 0.21 |
| 117192 | 0.00 | 0.00 | 1 | 0.17 |
| 33706 | 0.00 | 0.00 | 1 | 0.21 |
| ... | ... | ... | ... | ... |
| 53676 | 0.00 | 0.00 | 1 | 0.10 |
| 23519 | 0.00 | 0.00 | 1 | 0.17 |
| 22023 | 0.00 | 0.00 | 1 | 0.17 |
| 14082 | 0.00 | 0.00 | 1 | 0.17 |
| 105925 | 0.00 | 0.00 | 1 | 0.17 |
56353 rows × 4 columns
results_df['DelayScore'].mean()
0.21541493586672045
Comparamos el Success y el DelayScore
# Success rate grouped by DelayScore (one row per distinct score the tree can emit)
results_df_score = results_df.pivot_table(
    index = 'DelayScore',
    values = 'Success',
    # String aggregator names instead of raw callables: passing numpy functions
    # (np.mean) to aggfunc is deprecated in recent pandas. 'count' matches the
    # previous len because Success is a computed column with no missing values;
    # the first column is now labelled 'count' instead of 'len'.
    aggfunc = ['count', 'sum', 'mean']
)
results_df_score
| len | sum | mean | |
|---|---|---|---|
| Success | Success | Success | |
| DelayScore | |||
| 0.10 | 13426 | 12134 | 0.90 |
| 0.17 | 17707 | 14645 | 0.83 |
| 0.21 | 11042 | 8777 | 0.79 |
| 0.22 | 595 | 465 | 0.78 |
| 0.29 | 1218 | 867 | 0.71 |
| 0.29 | 5161 | 3632 | 0.70 |
| 0.30 | 309 | 227 | 0.73 |
| 0.32 | 1954 | 1322 | 0.68 |
| 0.44 | 2561 | 1428 | 0.56 |
| 0.47 | 379 | 208 | 0.55 |
| 0.51 | 164 | 79 | 0.48 |
| 0.65 | 1114 | 706 | 0.63 |
| 0.80 | 264 | 214 | 0.81 |
| 0.92 | 66 | 57 | 0.86 |
| 0.94 | 64 | 61 | 0.95 |
| 0.99 | 329 | 325 | 0.99 |
results_df_score.columns
MultiIndex([( 'len', 'Success'),
( 'sum', 'Success'),
('mean', 'Success')],
)
results_df_score['mean', 'Success']
DelayScore 0.10 0.90 0.17 0.83 0.21 0.79 0.22 0.78 0.29 0.71 0.29 0.70 0.30 0.73 0.32 0.68 0.44 0.56 0.47 0.55 0.51 0.48 0.65 0.63 0.80 0.81 0.92 0.86 0.94 0.95 0.99 0.99 Name: (mean, Success), dtype: float64
# Bar plot of the success rate for each DelayScore value
results_df_score['mean', 'Success'].plot(kind='bar', ylabel = 'Success');
Utilizamos la librería metrics de scikit-learn para calcular el AUC y la ROC Curve
metrics.roc_auc_score??
Signature: metrics.roc_auc_score( y_true, y_score, *, average='macro', sample_weight=None, max_fpr=None, multi_class='raise', labels=None, ) Source: @validate_params( { "y_true": ["array-like"], "y_score": ["array-like"], "average": [StrOptions({"micro", "macro", "samples", "weighted"}), None], "sample_weight": ["array-like", None], "max_fpr": [Interval(Real, 0.0, 1, closed="right"), None], "multi_class": [StrOptions({"raise", "ovr", "ovo"})], "labels": ["array-like", None], }, prefer_skip_nested_validation=True, ) def roc_auc_score( y_true, y_score, *, average="macro", sample_weight=None, max_fpr=None, multi_class="raise", labels=None, ): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) \ from prediction scores. Note: this implementation can be used with binary, multiclass and multilabel classification, but some restrictions apply (see Parameters). Read more in the :ref:`User Guide <roc_metrics>`. Parameters ---------- y_true : array-like of shape (n_samples,) or (n_samples, n_classes) True labels or binary label indicators. The binary and multiclass cases expect labels with shape (n_samples,) while the multilabel case expects binary label indicators with shape (n_samples, n_classes). y_score : array-like of shape (n_samples,) or (n_samples, n_classes) Target scores. * In the binary case, it corresponds to an array of shape `(n_samples,)`. Both probability estimates and non-thresholded decision values can be provided. The probability estimates correspond to the **probability of the class with the greater label**, i.e. `estimator.classes_[1]` and thus `estimator.predict_proba(X, y)[:, 1]`. The decision values corresponds to the output of `estimator.decision_function(X, y)`. See more information in the :ref:`User guide <roc_auc_binary>`; * In the multiclass case, it corresponds to an array of shape `(n_samples, n_classes)` of probability estimates provided by the `predict_proba` method. 
The probability estimates **must** sum to 1 across the possible classes. In addition, the order of the class scores must correspond to the order of ``labels``, if provided, or else to the numerical or lexicographical order of the labels in ``y_true``. See more information in the :ref:`User guide <roc_auc_multiclass>`; * In the multilabel case, it corresponds to an array of shape `(n_samples, n_classes)`. Probability estimates are provided by the `predict_proba` method and the non-thresholded decision values by the `decision_function` method. The probability estimates correspond to the **probability of the class with the greater label for each output** of the classifier. See more information in the :ref:`User guide <roc_auc_multilabel>`. average : {'micro', 'macro', 'samples', 'weighted'} or None, \ default='macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data. Note: multiclass ROC AUC currently only handles the 'macro' and 'weighted' averages. For multiclass targets, `average=None` is only implemented for `multi_class='ovr'` and `average='micro'` is only implemented for `multi_class='ovr'`. ``'micro'``: Calculate metrics globally by considering each element of the label indicator matrix as a label. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). ``'samples'``: Calculate metrics for each instance, and find their average. Will be ignored when ``y_true`` is binary. sample_weight : array-like of shape (n_samples,), default=None Sample weights. max_fpr : float > 0 and <= 1, default=None If not ``None``, the standardized partial AUC [2]_ over the range [0, max_fpr] is returned. 
For the multiclass case, ``max_fpr``, should be either equal to ``None`` or ``1.0`` as AUC ROC partial computation currently is not supported for multiclass. multi_class : {'raise', 'ovr', 'ovo'}, default='raise' Only used for multiclass targets. Determines the type of configuration to use. The default value raises an error, so either ``'ovr'`` or ``'ovo'`` must be passed explicitly. ``'ovr'``: Stands for One-vs-rest. Computes the AUC of each class against the rest [3]_ [4]_. This treats the multiclass case in the same way as the multilabel case. Sensitive to class imbalance even when ``average == 'macro'``, because class imbalance affects the composition of each of the 'rest' groupings. ``'ovo'``: Stands for One-vs-one. Computes the average AUC of all possible pairwise combinations of classes [5]_. Insensitive to class imbalance when ``average == 'macro'``. labels : array-like of shape (n_classes,), default=None Only used for multiclass targets. List of labels that index the classes in ``y_score``. If ``None``, the numerical or lexicographical order of the labels in ``y_true`` is used. Returns ------- auc : float Area Under the Curve score. See Also -------- average_precision_score : Area under the precision-recall curve. roc_curve : Compute Receiver operating characteristic (ROC) curve. RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic (ROC) curve given an estimator and some data. RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic (ROC) curve given the true and predicted values. References ---------- .. [1] `Wikipedia entry for the Receiver operating characteristic <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_ .. [2] `Analyzing a portion of the ROC curve. McClish, 1989 <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_ .. [3] Provost, F., Domingos, P. (2000). 
Well-trained PETs: Improving probability estimation trees (Section 6.2), CeDER Working Paper #IS-00-04, Stern School of Business, New York University. .. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern Recognition Letters, 27(8), 861-874. <https://www.sciencedirect.com/science/article/pii/S016786550500303X>`_ .. [5] `Hand, D.J., Till, R.J. (2001). A Simple Generalisation of the Area Under the ROC Curve for Multiple Class Classification Problems. Machine Learning, 45(2), 171-186. <http://link.springer.com/article/10.1023/A:1010920819831>`_ Examples -------- Binary case: >>> from sklearn.datasets import load_breast_cancer >>> from sklearn.linear_model import LogisticRegression >>> from sklearn.metrics import roc_auc_score >>> X, y = load_breast_cancer(return_X_y=True) >>> clf = LogisticRegression(solver="liblinear", random_state=0).fit(X, y) >>> roc_auc_score(y, clf.predict_proba(X)[:, 1]) 0.99... >>> roc_auc_score(y, clf.decision_function(X)) 0.99... Multiclass case: >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) >>> clf = LogisticRegression(solver="liblinear").fit(X, y) >>> roc_auc_score(y, clf.predict_proba(X), multi_class='ovr') 0.99... Multilabel case: >>> import numpy as np >>> from sklearn.datasets import make_multilabel_classification >>> from sklearn.multioutput import MultiOutputClassifier >>> X, y = make_multilabel_classification(random_state=0) >>> clf = MultiOutputClassifier(clf).fit(X, y) >>> # get a list of n_output containing probability arrays of shape >>> # (n_samples, n_classes) >>> y_pred = clf.predict_proba(X) >>> # extract the positive columns for each output >>> y_pred = np.transpose([pred[:, 1] for pred in y_pred]) >>> roc_auc_score(y, y_pred, average=None) array([0.82..., 0.86..., 0.94..., 0.85... , 0.94...]) >>> from sklearn.linear_model import RidgeClassifierCV >>> clf = RidgeClassifierCV().fit(X, y) >>> roc_auc_score(y, clf.decision_function(X), average=None) array([0.81..., 0.84... 
, 0.93..., 0.87..., 0.94...]) """ y_type = type_of_target(y_true, input_name="y_true") y_true = check_array(y_true, ensure_2d=False, dtype=None) y_score = check_array(y_score, ensure_2d=False) if y_type == "multiclass" or ( y_type == "binary" and y_score.ndim == 2 and y_score.shape[1] > 2 ): # do not support partial ROC computation for multiclass if max_fpr is not None and max_fpr != 1.0: raise ValueError( "Partial AUC computation not available in " "multiclass setting, 'max_fpr' must be" " set to `None`, received `max_fpr={0}` " "instead".format(max_fpr) ) if multi_class == "raise": raise ValueError("multi_class must be in ('ovo', 'ovr')") return _multiclass_roc_auc_score( y_true, y_score, labels, multi_class, average, sample_weight ) elif y_type == "binary": labels = np.unique(y_true) y_true = label_binarize(y_true, classes=labels)[:, 0] return _average_binary_score( partial(_binary_roc_auc_score, max_fpr=max_fpr), y_true, y_score, average, sample_weight=sample_weight, ) else: # multilabel-indicator return _average_binary_score( partial(_binary_roc_auc_score, max_fpr=max_fpr), y_true, y_score, average, sample_weight=sample_weight, ) File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\metrics\_ranking.py Type: function
# AUC of the tree on the test partition: a score-based metric, independent of any threshold
dt_AUC = metrics.roc_auc_score(
    y_test,
    y_test_score)
print('AUC: %.3f' % dt_AUC)
AUC: 0.680
No está mal. A partir de 0.75 es un muy buen modelo
Dibujamos la Curva ROC
metrics.roc_curve??
Signature: metrics.roc_curve( y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True, ) Source: @validate_params( { "y_true": ["array-like"], "y_score": ["array-like"], "pos_label": [Real, str, "boolean", None], "sample_weight": ["array-like", None], "drop_intermediate": ["boolean"], }, prefer_skip_nested_validation=True, ) def roc_curve( y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True ): """Compute Receiver operating characteristic (ROC). Note: this implementation is restricted to the binary classification task. Read more in the :ref:`User Guide <roc_metrics>`. Parameters ---------- y_true : array-like of shape (n_samples,) True binary labels. If labels are not either {-1, 1} or {0, 1}, then pos_label should be explicitly given. y_score : array-like of shape (n_samples,) Target scores, can either be probability estimates of the positive class, confidence values, or non-thresholded measure of decisions (as returned by "decision_function" on some classifiers). pos_label : int, float, bool or str, default=None The label of the positive class. When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1}, ``pos_label`` is set to 1, otherwise an error will be raised. sample_weight : array-like of shape (n_samples,), default=None Sample weights. drop_intermediate : bool, default=True Whether to drop some suboptimal thresholds which would not appear on a plotted ROC curve. This is useful in order to create lighter ROC curves. .. versionadded:: 0.17 parameter *drop_intermediate*. Returns ------- fpr : ndarray of shape (>2,) Increasing false positive rates such that element i is the false positive rate of predictions with score >= `thresholds[i]`. tpr : ndarray of shape (>2,) Increasing true positive rates such that element `i` is the true positive rate of predictions with score >= `thresholds[i]`. thresholds : ndarray of shape (n_thresholds,) Decreasing thresholds on the decision function used to compute fpr and tpr. 
`thresholds[0]` represents no instances being predicted and is arbitrarily set to `np.inf`. See Also -------- RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic (ROC) curve given an estimator and some data. RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic (ROC) curve given the true and predicted values. det_curve: Compute error rates for different probability thresholds. roc_auc_score : Compute the area under the ROC curve. Notes ----- Since the thresholds are sorted from low to high values, they are reversed upon returning them to ensure they correspond to both ``fpr`` and ``tpr``, which are sorted in reversed order during their calculation. An arbitrary threshold is added for the case `tpr=0` and `fpr=0` to ensure that the curve starts at `(0, 0)`. This threshold corresponds to the `np.inf`. References ---------- .. [1] `Wikipedia entry for the Receiver operating characteristic <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_ .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition Letters, 2006, 27(8):861-874. Examples -------- >>> import numpy as np >>> from sklearn import metrics >>> y = np.array([1, 1, 2, 2]) >>> scores = np.array([0.1, 0.4, 0.35, 0.8]) >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2) >>> fpr array([0. , 0. , 0.5, 0.5, 1. ]) >>> tpr array([0. , 0.5, 0.5, 1. , 1. ]) >>> thresholds array([ inf, 0.8 , 0.4 , 0.35, 0.1 ]) """ fps, tps, thresholds = _binary_clf_curve( y_true, y_score, pos_label=pos_label, sample_weight=sample_weight ) # Attempt to drop thresholds corresponding to points in between and # collinear with other points. These are always suboptimal and do not # appear on a plotted ROC curve (and thus do not affect the AUC). # Here np.diff(_, 2) is used as a "second derivative" to tell if there # is a corner at the point. 
Both fps and tps must be tested to handle # thresholds with multiple data points (which are combined in # _binary_clf_curve). This keeps all cases where the point should be kept, # but does not drop more complicated cases like fps = [1, 3, 7], # tps = [1, 2, 4]; there is no harm in keeping too many thresholds. if drop_intermediate and len(fps) > 2: optimal_idxs = np.where( np.r_[True, np.logical_or(np.diff(fps, 2), np.diff(tps, 2)), True] )[0] fps = fps[optimal_idxs] tps = tps[optimal_idxs] thresholds = thresholds[optimal_idxs] # Add an extra threshold position # to make sure that the curve starts at (0, 0) tps = np.r_[0, tps] fps = np.r_[0, fps] # get dtype of `y_score` even if it is an array-like thresholds = np.r_[np.inf, thresholds] if fps[-1] <= 0: warnings.warn( "No negative samples in y_true, false positive value should be meaningless", UndefinedMetricWarning, ) fpr = np.repeat(np.nan, fps.shape) else: fpr = fps / fps[-1] if tps[-1] <= 0: warnings.warn( "No positive samples in y_true, true positive value should be meaningless", UndefinedMetricWarning, ) tpr = np.repeat(np.nan, tps.shape) else: tpr = tps / tps[-1] return fpr, tpr, thresholds File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\metrics\_ranking.py Type: function
# Arrays for both ROC axes plus the threshold that generates each (fpr, tpr) point
fpr, tpr, th = metrics.roc_curve(y_test, y_test_score)
fpr
array([0.00000000e+00, 9.03668896e-05, 1.58142057e-04, 3.61467558e-04,
1.49105368e-03, 1.07084764e-02, 1.26287728e-02, 1.73278511e-02,
4.95888307e-02, 7.94550877e-02, 8.45834086e-02, 1.66636544e-01,
1.86223568e-01, 1.96728719e-01, 3.95016266e-01, 7.25872040e-01,
1.00000000e+00])
tpr
array([0. , 0.02688394, 0.03192985, 0.03664488, 0.05434693,
0.11274713, 0.11928199, 0.13342708, 0.22714865, 0.27942758,
0.2862106 , 0.41268922, 0.44172388, 0.45247746, 0.63983787,
0.89312598, 1. ])
# 17 thresholds were used; the first is inf so the curve starts at (0, 0)
th
array([ inf, 0.99070385, 0.94202899, 0.91719745, 0.8 ,
0.6509105 , 0.51243781, 0.46963124, 0.44236287, 0.32379888,
0.30152672, 0.29054054, 0.28990694, 0.21970289, 0.20779221,
0.17250356, 0.09706014])
# Draw the ROC curve of the tree against the no-skill diagonal
plt.clf()  # clear any previous figure content
# Model curve. Legend label fixed: 'clasifier_df' was misspelled and named the
# wrong object — the fitted classifier variable is dt, not df.
plt.plot(fpr, tpr, marker = '.', label = 'classifier_dt')
# Random (no-skill) classifier: the TPR = FPR diagonal, AUC = 0.5
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', label = 'classifier_random')
# axis labels
plt.xlabel('FALSE POSITIVE RATE')
plt.ylabel('TRUE POSITIVE RATE')
plt.title('ROC Curve')
# show the legend
plt.legend()
# show the plot
plt.show()
Vemos cómo se comporta el modelo con respecto al mes, es decir vemos los aciertos (Success) por meses.
results_df.head()
| ARR_DEL15 | DelayPrediction | Success | DelayScore | |
|---|---|---|---|---|
| 91964 | 1.00 | 1.00 | 1 | 0.80 |
| 205250 | 1.00 | 0.00 | 0 | 0.10 |
| 156190 | 0.00 | 0.00 | 1 | 0.21 |
| 117192 | 0.00 | 0.00 | 1 | 0.17 |
| 33706 | 0.00 | 0.00 | 1 | 0.21 |
# Add the YEAR and MONTH of each test record so the hit rate can be broken down over time
results_df['YEAR'] = X_test['YEAR']
results_df['MONTH'] = X_test['MONTH']
results_df.pivot_table(
    index = ['YEAR', 'MONTH'],  # index by year AND month so equal months of different years are kept apart
    values = 'Success',
    # String aggregators: passing numpy callables (np.mean) to aggfunc is
    # deprecated in recent pandas. 'count' equals the previous len because
    # Success is a computed column with no missing values.
    aggfunc = ['count', 'sum', 'mean']
)
| len | sum | mean | ||
|---|---|---|---|---|
| Success | Success | Success | ||
| YEAR | MONTH | |||
| 2018 | 1 | 2796 | 2222 | 0.79 |
| 2 | 2694 | 2319 | 0.86 | |
| 3 | 2900 | 2402 | 0.83 | |
| 4 | 3132 | 2515 | 0.80 | |
| 5 | 3245 | 2568 | 0.79 | |
| 6 | 3356 | 2595 | 0.77 | |
| 7 | 3327 | 2365 | 0.71 | |
| 8 | 3444 | 2507 | 0.73 | |
| 9 | 3155 | 2545 | 0.81 | |
| 10 | 3300 | 2849 | 0.86 | |
| 11 | 3042 | 2499 | 0.82 | |
| 12 | 3196 | 2576 | 0.81 | |
| 2019 | 1 | 3117 | 2532 | 0.81 |
| 2 | 2814 | 2328 | 0.83 | |
| 3 | 3315 | 2802 | 0.85 | |
| 4 | 3065 | 2452 | 0.80 | |
| 5 | 3305 | 2716 | 0.82 | |
| 6 | 3150 | 2355 | 0.75 |
¿Cómo controlo el overfitting?
Accuracy variando la profundidad del árbol
Determinamos el accuracy de las particiones de train, test y validación variando el max_depth entre 1 y 17
# Sweep the tree depth to see at which point the model starts to overfit:
# train accuracy keeps climbing while test/validation accuracy stalls or drops.
for depth in range(1, 18):
    candidate_tree = DecisionTreeClassifier(max_depth=depth, random_state=42)  # model for this depth
    candidate_tree.fit(X_train, y_train)
    accuracies = (
        candidate_tree.score(X_train, y_train),    # train accuracy
        candidate_tree.score(X_test, y_test),      # test accuracy
        candidate_tree.score(val_df_X, val_df_y),  # validation accuracy
    )
    print('Profundidad del árbol: {}. Acc_train: {} - Acc_test: {} - Acc_val: {}'.format(depth, *accuracies))
Profundidad del árbol: 1. Acc_train: 0.7984835118033585 - Acc_test: 0.7979167036360088 - Acc_val: 0.8134679270112821 Profundidad del árbol: 2. Acc_train: 0.7984835118033585 - Acc_test: 0.7979167036360088 - Acc_val: 0.8134679270112821 Profundidad del árbol: 3. Acc_train: 0.8007802993429058 - Acc_test: 0.8005962415488084 - Acc_val: 0.815072780086983 Profundidad del árbol: 4. Acc_train: 0.8012822462886348 - Acc_test: 0.8011463453587209 - Acc_val: 0.814960440371684 Profundidad del árbol: 5. Acc_train: 0.8019515088829399 - Acc_test: 0.8015367416109169 - Acc_val: 0.8155381874789362 Profundidad del árbol: 6. Acc_train: 0.8038300073010465 - Acc_test: 0.8030450907671286 - Acc_val: 0.8153295565790952 Profundidad del árbol: 7. Acc_train: 0.8059899002190314 - Acc_test: 0.8033999964509432 - Acc_val: 0.8153456051098522 Profundidad del árbol: 8. Acc_train: 0.8094198710148455 - Acc_test: 0.8037903927031391 - Acc_val: 0.8148320521256279 Profundidad del árbol: 9. Acc_train: 0.8128194207836457 - Acc_test: 0.8025837133781698 - Acc_val: 0.8125692092888896 Profundidad del árbol: 10. Acc_train: 0.8170935750790946 - Acc_test: 0.8044647135023867 - Acc_val: 0.8065831073165252 Profundidad del árbol: 11. Acc_train: 0.8221586760769044 - Acc_test: 0.8036839209979948 - Acc_val: 0.8068238352778803 Profundidad del árbol: 12. Acc_train: 0.8286915916281333 - Acc_test: 0.8024240058204533 - Acc_val: 0.7969379403315626 Profundidad del árbol: 13. Acc_train: 0.8361675590167924 - Acc_test: 0.8001171188756588 - Acc_val: 0.7941454959798431 Profundidad del árbol: 14. Acc_train: 0.8454839985397907 - Acc_test: 0.798413571593349 - Acc_val: 0.7884803646226188 Profundidad del árbol: 15. Acc_train: 0.8554697006570942 - Acc_test: 0.7938530335563324 - Acc_val: 0.7811461860666656 Profundidad del árbol: 16. Acc_train: 0.8668471647602823 - Acc_test: 0.7904991748442851 - Acc_val: 0.7738922501644975 Profundidad del árbol: 17. Acc_train: 0.8788938914577756 - Acc_test: 0.7875357123844339 - Acc_val: 0.7650013641251143
En nuestro caso se produce entre la profundidad de 8 y la de 11
En este ejemplo, podemos ver claramente como el árbol hace Overfit a medida que incrementamos la profundidad, y es por este motivo que tendremos que recurrir a mecanismos de poda
Appropriate fit:
El overfit funciona mejor en el train que en el test. Con el underfit todavía tenemos margen para mejorar el modelo
Tomamos como mejor profundidad la de 9
Accuracy variando el número mínimo de filas
Determinamos el accuracy de las particiones de train, test y validación variando el max_depth entre 1 y 17, con un min_samples_split de 500
# Same depth sweep, but requiring at least 500 records before a node may split (pruning)
for depth in range(1, 18):
    pruned_tree = DecisionTreeClassifier(max_depth=depth, random_state=42, min_samples_split=500)
    pruned_tree.fit(X_train, y_train)
    acc = {
        'train': pruned_tree.score(X_train, y_train),
        'test': pruned_tree.score(X_test, y_test),
        'val': pruned_tree.score(val_df_X, val_df_y),
    }
    print('Profundidad del árbol: {}. Acc_train: {} - Acc_test: {} - Acc_val: {}'.format(
        depth, acc['train'], acc['test'], acc['val']))
Profundidad del árbol: 1. Acc_train: 0.7984835118033585 - Acc_test: 0.7979167036360088 - Acc_val: 0.8134679270112821 Profundidad del árbol: 2. Acc_train: 0.7984835118033585 - Acc_test: 0.7979167036360088 - Acc_val: 0.8134679270112821 Profundidad del árbol: 3. Acc_train: 0.8007802993429058 - Acc_test: 0.8005962415488084 - Acc_val: 0.815072780086983 Profundidad del árbol: 4. Acc_train: 0.8012822462886348 - Acc_test: 0.8011463453587209 - Acc_val: 0.814960440371684 Profundidad del árbol: 5. Acc_train: 0.8017765879776101 - Acc_test: 0.8013592887690096 - Acc_val: 0.8156023816019643 Profundidad del árbol: 6. Acc_train: 0.8031455341932343 - Acc_test: 0.8027256756516956 - Acc_val: 0.8154097992328803 Profundidad del árbol: 7. Acc_train: 0.8048871379897785 - Acc_test: 0.8031160719038916 - Acc_val: 0.8153937507021232 Profundidad del árbol: 8. Acc_train: 0.8067732416646386 - Acc_test: 0.8030273454829379 - Acc_val: 0.8143826932644317 Profundidad del árbol: 9. Acc_train: 0.8076706619615478 - Acc_test: 0.8022642982627367 - Acc_val: 0.8136284123188522 Profundidad del árbol: 10. Acc_train: 0.8085072402044293 - Acc_test: 0.803346760598371 - Acc_val: 0.8086694163149364 Profundidad del árbol: 11. Acc_train: 0.8090167924069117 - Acc_test: 0.803737156850567 - Acc_val: 0.8088940957455345 Profundidad del árbol: 12. Acc_train: 0.8091156607447068 - Acc_test: 0.8036484304296133 - Acc_val: 0.8087015133764504 Profundidad del árbol: 13. Acc_train: 0.8092145290825018 - Acc_test: 0.8034177417351339 - Acc_val: 0.8085249795381233 Profundidad del árbol: 14. Acc_train: 0.8095035288391336 - Acc_test: 0.803541958724469 - Acc_val: 0.8084607854150953 Profundidad del árbol: 15. Acc_train: 0.8096100024336822 - Acc_test: 0.8035774492928505 - Acc_val: 0.8079793294923849 Profundidad del árbol: 16. Acc_train: 0.8096252129471891 - Acc_test: 0.8035774492928505 - Acc_val: 0.808011426553899 Profundidad del árbol: 17. Acc_train: 0.8096860550012168 - Acc_test: 0.8034532323035153 - Acc_val: 0.807995378023142
Determinamos el accuracy de la partición de validación utilizando el estimador con una profundidad de 11 y un mínimo de 500 registros en cada subgrupo.
dt = DecisionTreeClassifier(max_depth = 11, random_state = 42, min_samples_split = 500) # Instantiate the pruned model
dt.fit(X_train, y_train) # Train it on the training partition
DecisionTreeClassifier(max_depth=11, min_samples_split=500, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(max_depth=11, min_samples_split=500, random_state=42)
acc_train = dt.score(X_train, y_train) # train accuracy
acc_test = dt.score(X_test, y_test) # test accuracy
acc_val = dt.score(val_df_X, val_df_y) # validation accuracy
# Compare the accuracy of the three partitions in a single row
pd.DataFrame({'Acc_train':[acc_train], 'Acc_test':[acc_test], 'Acc_val':[acc_val]})
| Acc_train | Acc_test | Acc_val | |
|---|---|---|---|
| 0 | 0.81 | 0.80 | 0.81 |
AUC para diferentes profundidades
Normalmente se mide con el AUC mejor que con el ACCURACY
NOTA: Nunca dejar más de 2 puntos de diferencia de overfit en el AUC (si tengo un 0,82 en train, el de test como mínimo 0,80). Si esto ocurre hay que ser MÁS RESTRICTIVOS --> tenemos que dar menos profundidad y más min_samples_split
# AUC sweep over depth with the default min_samples_split of 2.
# NOTE(review): this loop reuses/overwrites the top-level names y_train_score,
# y_test_score and y_val_score from earlier cells.
for i in range(1, 18):
    df_overfit = DecisionTreeClassifier(max_depth=i, random_state=42, min_samples_split=2)
    df_overfit.fit(X_train, y_train) # Train the model
    # P(delay) for every training record, wrapped in a DataFrame aligned with y_train
    y_train_score = pd.DataFrame(df_overfit.predict_proba(X = X_train)[:,1], index = y_train.index, columns=['DelayScore'])
    AUC_train = metrics.roc_auc_score(y_train, y_train_score)
    y_test_score = pd.DataFrame(df_overfit.predict_proba(X = X_test)[:,1], index = y_test.index, columns=['DelayScore'])
    AUC_test = metrics.roc_auc_score(y_test, y_test_score)
    y_val_score = pd.DataFrame(df_overfit.predict_proba(X = val_df_X)[:,1], index = val_df_y.index, columns=['DelayScore'])
    AUC_val = metrics.roc_auc_score(val_df_y, y_val_score)
    print('Profundidad del árbol: {}. AUC_train: {} - AUC_test: {} - AUC_Val: {}'.format(i, AUC_train, AUC_test, AUC_val))
Profundidad del árbol: 1. AUC_train: 0.5602715929722952 - AUC_test: 0.5588769681522083 - AUC_Val: 0.5520433659355883 Profundidad del árbol: 2. AUC_train: 0.6310958859216068 - AUC_test: 0.6299719094198598 - AUC_Val: 0.6249751414436124 Profundidad del árbol: 3. AUC_train: 0.6623357304272056 - AUC_test: 0.6616678800926384 - AUC_Val: 0.655989065556006 Profundidad del árbol: 4. AUC_train: 0.6794643252716981 - AUC_test: 0.6795882513669739 - AUC_Val: 0.6714058443490203 Profundidad del árbol: 5. AUC_train: 0.6940423055578773 - AUC_test: 0.6936594988383418 - AUC_Val: 0.6794336923981718 Profundidad del árbol: 6. AUC_train: 0.705330023397478 - AUC_test: 0.7025370917248372 - AUC_Val: 0.6950269387666368 Profundidad del árbol: 7. AUC_train: 0.7139626157558411 - AUC_test: 0.7096309476853228 - AUC_Val: 0.6984662257292258 Profundidad del árbol: 8. AUC_train: 0.7223656300927249 - AUC_test: 0.7122414951182071 - AUC_Val: 0.7023453533851199 Profundidad del árbol: 9. AUC_train: 0.7316034794121453 - AUC_test: 0.7138938219994586 - AUC_Val: 0.6989655126172019 Profundidad del árbol: 10. AUC_train: 0.7439824515806714 - AUC_test: 0.7130502868156419 - AUC_Val: 0.6882860803412736 Profundidad del árbol: 11. AUC_train: 0.7576068243644773 - AUC_test: 0.7106891948678663 - AUC_Val: 0.6823542351221211 Profundidad del árbol: 12. AUC_train: 0.7741351243612673 - AUC_test: 0.7065532567310551 - AUC_Val: 0.670404636036892 Profundidad del árbol: 13. AUC_train: 0.791921283833079 - AUC_test: 0.697211896280369 - AUC_Val: 0.6545093823785613 Profundidad del árbol: 14. AUC_train: 0.8118310697491642 - AUC_test: 0.6868748994314218 - AUC_Val: 0.6379439433735526 Profundidad del árbol: 15. AUC_train: 0.8336621939113313 - AUC_test: 0.6747988678895278 - AUC_Val: 0.6238811942055856 Profundidad del árbol: 16. AUC_train: 0.8566241633566655 - AUC_test: 0.6575843370357122 - AUC_Val: 0.6035999590300353 Profundidad del árbol: 17. AUC_train: 0.8786371556681953 - AUC_test: 0.6490580361819488 - AUC_Val: 0.5893174277472518
Podemos apreciar que el AUC de la validación llega a un máximo de 0.7023 con una profundidad de 8, y la diferencia entre el train y el test es de 1 punto. Esta sería la mejor profundidad.
# Same depth sweep, now requiring at least 500 rows before a node may split
for depth in range(1, 18):
    df_overfit = DecisionTreeClassifier(max_depth=depth, random_state=42, min_samples_split=500)
    df_overfit.fit(X_train, y_train)
    # Positive-class probability for every partition
    y_train_score = pd.DataFrame(df_overfit.predict_proba(X=X_train)[:, 1], index=y_train.index, columns=['DelayScore'])
    y_test_score = pd.DataFrame(df_overfit.predict_proba(X=X_test)[:, 1], index=y_test.index, columns=['DelayScore'])
    y_val_score = pd.DataFrame(df_overfit.predict_proba(X=val_df_X)[:, 1], index=val_df_y.index, columns=['DelayScore'])
    # AUC of each partition at this depth
    AUC_train = metrics.roc_auc_score(y_train, y_train_score)
    AUC_test = metrics.roc_auc_score(y_test, y_test_score)
    AUC_val = metrics.roc_auc_score(val_df_y, y_val_score)
    print('Profundidad del árbol: {}. AUC_train: {} - AUC_test: {} - AUC_Val: {}'.format(depth, AUC_train, AUC_test, AUC_val))
Profundidad del árbol: 1. AUC_train: 0.5602715929722952 - AUC_test: 0.5588769681522083 - AUC_Val: 0.5520433659355883 Profundidad del árbol: 2. AUC_train: 0.6310958859216068 - AUC_test: 0.6299719094198598 - AUC_Val: 0.6249751414436124 Profundidad del árbol: 3. AUC_train: 0.6623357304272056 - AUC_test: 0.6616678800926384 - AUC_Val: 0.655989065556006 Profundidad del árbol: 4. AUC_train: 0.6794643252716981 - AUC_test: 0.6795882513669739 - AUC_Val: 0.6714058443490203 Profundidad del árbol: 5. AUC_train: 0.694008612631609 - AUC_test: 0.6936243264661723 - AUC_Val: 0.6795082611557426 Profundidad del árbol: 6. AUC_train: 0.7050699509152232 - AUC_test: 0.702713999169991 - AUC_Val: 0.6952820204136938 Profundidad del árbol: 7. AUC_train: 0.7128195488918901 - AUC_test: 0.7097155755411059 - AUC_Val: 0.698184591372657 Profundidad del árbol: 8. AUC_train: 0.7191814846595909 - AUC_test: 0.7124659789852767 - AUC_Val: 0.7036508172872761 Profundidad del árbol: 9. AUC_train: 0.7254215142479201 - AUC_test: 0.7155350398978527 - AUC_Val: 0.7046986259908332 Profundidad del árbol: 10. AUC_train: 0.7330131974203302 - AUC_test: 0.7186680412340927 - AUC_Val: 0.6983898404426991 Profundidad del árbol: 11. AUC_train: 0.7387416657476136 - AUC_test: 0.7208496683066462 - AUC_Val: 0.7000219299116517 Profundidad del árbol: 12. AUC_train: 0.7435581093670365 - AUC_test: 0.7218443460190287 - AUC_Val: 0.6992740655910844 Profundidad del árbol: 13. AUC_train: 0.7473080881366558 - AUC_test: 0.7214254526159731 - AUC_Val: 0.69771728563972 Profundidad del árbol: 14. AUC_train: 0.7506026473211294 - AUC_test: 0.7214433948052935 - AUC_Val: 0.6954233112392999 Profundidad del árbol: 15. AUC_train: 0.7539245986811574 - AUC_test: 0.7221126874290695 - AUC_Val: 0.693288119683687 Profundidad del árbol: 16. AUC_train: 0.757079156594042 - AUC_test: 0.7219915472834267 - AUC_Val: 0.6924828192219754 Profundidad del árbol: 17. AUC_train: 0.7587876492712966 - AUC_test: 0.7214389807389281 - AUC_Val: 0.6910047484782454
Podemos apreciar que el AUC de la validación llega a un máximo de 0.7047 con una profundidad de 9, y la diferencia entre el train y el test es de 1 punto. Esta sería la mejor profundidad.
Al meter una mayor restricción en el min_samples_split, el modelo tiene que aumentar su profundidad para conseguir el mejor AUC.
Consiste en aplicar el procedimiento de modelización del train/test split "k" veces y obtener la métrica de rendimiento promedio de los "k" modelos.
Utilizamos el módulo de model_selection de scikit-learn para aplicar el k-Fold.
Image('pictures/kfold.jpg')
En cada iteración:
Nº muestras development / k
[Nº muestras development * ( k - 1 )] / k
Si hacemos 10 iteraciones (k=10):
# Number of test samples per fold: development set size / k (k = 10)
round(len(dev_df)/10, 0)
18784.0
# Number of training samples per fold: development set size * (k - 1) / k
round((len(dev_df)*9)/10, 0)
169057.0
model_selection??
Type: module String form: <module 'sklearn.model_selection' from 'c:\\Users\\jagui\\anaconda3\\lib\\site-packages\\sklearn\\model_selection\\__init__.py'> File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\model_selection\__init__.py Source: import typing from ._plot import LearningCurveDisplay, ValidationCurveDisplay from ._search import GridSearchCV, ParameterGrid, ParameterSampler, RandomizedSearchCV from ._split import ( BaseCrossValidator, BaseShuffleSplit, GroupKFold, GroupShuffleSplit, KFold, LeaveOneGroupOut, LeaveOneOut, LeavePGroupsOut, LeavePOut, PredefinedSplit, RepeatedKFold, RepeatedStratifiedKFold, ShuffleSplit, StratifiedGroupKFold, StratifiedKFold, StratifiedShuffleSplit, TimeSeriesSplit, check_cv, train_test_split, ) from ._validation import ( cross_val_predict, cross_val_score, cross_validate, learning_curve, permutation_test_score, validation_curve, ) if typing.TYPE_CHECKING: # Avoid errors in type checkers (e.g. mypy) for experimental estimators. # TODO: remove this check once the estimator is no longer experimental. from ._search_successive_halving import ( # noqa HalvingGridSearchCV, HalvingRandomSearchCV, ) __all__ = [ "BaseCrossValidator", "BaseShuffleSplit", "GridSearchCV", "TimeSeriesSplit", "KFold", "GroupKFold", "GroupShuffleSplit", "LeaveOneGroupOut", "LeaveOneOut", "LeavePGroupsOut", "LeavePOut", "RepeatedKFold", "RepeatedStratifiedKFold", "ParameterGrid", "ParameterSampler", "PredefinedSplit", "RandomizedSearchCV", "ShuffleSplit", "StratifiedKFold", "StratifiedGroupKFold", "StratifiedShuffleSplit", "check_cv", "cross_val_predict", "cross_val_score", "cross_validate", "learning_curve", "LearningCurveDisplay", "permutation_test_score", "train_test_split", "validation_curve", "ValidationCurveDisplay", ] # TODO: remove this check once the estimator is no longer experimental. 
def __getattr__(name): if name in {"HalvingGridSearchCV", "HalvingRandomSearchCV"}: raise ImportError( f"{name} is experimental and the API might change without any " "deprecation cycle. To use it, you need to explicitly import " "enable_halving_search_cv:\n" "from sklearn.experimental import enable_halving_search_cv" ) raise AttributeError(f"module {__name__} has no attribute {name}")
model_selection.KFold??
Init signature: model_selection.KFold(n_splits=5, *, shuffle=False, random_state=None) Source: class KFold(_BaseKFold): """K-Folds cross-validator Provides train/test indices to split data in train/test sets. Split dataset into k consecutive folds (without shuffling by default). Each fold is then used once as a validation while the k - 1 remaining folds form the training set. Read more in the :ref:`User Guide <k_fold>`. Parameters ---------- n_splits : int, default=5 Number of folds. Must be at least 2. .. versionchanged:: 0.22 ``n_splits`` default value changed from 3 to 5. shuffle : bool, default=False Whether to shuffle the data before splitting into batches. Note that the samples within each split will not be shuffled. random_state : int, RandomState instance or None, default=None When `shuffle` is True, `random_state` affects the ordering of the indices, which controls the randomness of each fold. Otherwise, this parameter has no effect. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import KFold >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([1, 2, 3, 4]) >>> kf = KFold(n_splits=2) >>> kf.get_n_splits(X) 2 >>> print(kf) KFold(n_splits=2, random_state=None, shuffle=False) >>> for i, (train_index, test_index) in enumerate(kf.split(X)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") Fold 0: Train: index=[2 3] Test: index=[0 1] Fold 1: Train: index=[0 1] Test: index=[2 3] Notes ----- The first ``n_samples % n_splits`` folds have size ``n_samples // n_splits + 1``, other folds have size ``n_samples // n_splits``, where ``n_samples`` is the number of samples. Randomized CV splitters may return different results for each call of split. You can make the results identical by setting `random_state` to an integer. 
See Also -------- StratifiedKFold : Takes class information into account to avoid building folds with imbalanced class distributions (for binary or multiclass classification tasks). GroupKFold : K-fold iterator variant with non-overlapping groups. RepeatedKFold : Repeats K-Fold n times. """ def __init__(self, n_splits=5, *, shuffle=False, random_state=None): super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) def _iter_test_indices(self, X, y=None, groups=None): n_samples = _num_samples(X) indices = np.arange(n_samples) if self.shuffle: check_random_state(self.random_state).shuffle(indices) n_splits = self.n_splits fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int) fold_sizes[: n_samples % n_splits] += 1 current = 0 for fold_size in fold_sizes: start, stop = current, current + fold_size yield indices[start:stop] current = stop File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\model_selection\_split.py Type: ABCMeta Subclasses:
Vamos a hacer 10 iteraciones.
# 10-fold cross-validation splitter: 10 different random train/test partitions,
# hence 10 distinct models. shuffle=True randomizes row order before folding;
# random_state fixes that shuffle so the folds are reproducible.
kf = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)
# split() returns a lazy generator of (train_indices, test_indices) pairs;
# y and groups are not needed for a plain (non-stratified, non-grouped) KFold
kf.split(dev_df, y=None, groups=None)
<generator object _BaseKFold.split at 0x000001E2EDB8AC80>
# Train and test indices for each of the 10 folds
for tr_idx, te_idx in kf.split(dev_df):
    print("TRAIN:", tr_idx, "Samples_Train:", len(tr_idx), '\n')
    print("TEST:", te_idx, "Samples_Test:", len(te_idx))
TRAIN: [ 0 1 2 ... 187836 187838 187839] Samples_Train: 169056 TEST: [ 6 12 22 ... 187835 187837 187840] Samples_Test: 18785 TRAIN: [ 1 2 3 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 0 11 20 ... 187801 187810 187813] Samples_Test: 18784 TRAIN: [ 0 1 2 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 24 30 42 ... 187825 187827 187829] Samples_Test: 18784 TRAIN: [ 0 1 2 ... 187837 187838 187840] Samples_Train: 169057 TEST: [ 4 23 39 ... 187808 187811 187839] Samples_Test: 18784 TRAIN: [ 0 1 2 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 3 21 41 ... 187822 187832 187834] Samples_Test: 18784 TRAIN: [ 0 1 2 ... 187837 187839 187840] Samples_Train: 169057 TEST: [ 8 14 25 ... 187824 187830 187838] Samples_Test: 18784 TRAIN: [ 0 2 3 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 1 7 17 ... 187826 187828 187833] Samples_Test: 18784 TRAIN: [ 0 1 3 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 2 9 13 ... 187782 187809 187817] Samples_Test: 18784 TRAIN: [ 0 1 2 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 10 16 18 ... 187803 187831 187836] Samples_Test: 18784 TRAIN: [ 0 1 2 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 5 15 26 ... 187799 187812 187821] Samples_Test: 18784
scores_list = []  # one accuracy per fold
for train_index, test_index in kf.split(dev_df):
    # Show which rows land in each partition and their counts
    print("TRAIN:", train_index, "Samples_Train:", len(train_index))
    print("TEST:", test_index, "Samples_Test:", len(test_index))
    # Materialize this fold's train/test partitions
    X_train = dev_df_X.iloc[train_index]
    X_test = dev_df_X.iloc[test_index]
    y_train = dev_df_y.iloc[train_index]
    y_test = dev_df_y.iloc[test_index]
    # Fit a fresh depth-9 tree on the fold's training part (fit returns the estimator)
    dt = DecisionTreeClassifier(max_depth=9, random_state=42).fit(X_train, y_train)
    _score = dt.score(X_test, y_test)  # fold accuracy
    scores_list.append(_score)
    print("Acc:",_score,"\n")
print("Accuracy de cada iteracción:")
scores_list
TRAIN: [ 0 1 2 ... 187836 187838 187839] Samples_Train: 169056 TEST: [ 6 12 22 ... 187835 187837 187840] Samples_Test: 18785 Acc: 0.8011179132286399 TRAIN: [ 1 2 3 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 0 11 20 ... 187801 187810 187813] Samples_Test: 18784 Acc: 0.8040353492333902 TRAIN: [ 0 1 2 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 24 30 42 ... 187825 187827 187829] Samples_Test: 18784 Acc: 0.806697189097104 TRAIN: [ 0 1 2 ... 187837 187838 187840] Samples_Train: 169057 TEST: [ 4 23 39 ... 187808 187811 187839] Samples_Test: 18784 Acc: 0.8045144804088586 TRAIN: [ 0 1 2 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 3 21 41 ... 187822 187832 187834] Samples_Test: 18784 Acc: 0.8030238500851788 TRAIN: [ 0 1 2 ... 187837 187839 187840] Samples_Train: 169057 TEST: [ 8 14 25 ... 187824 187830 187838] Samples_Test: 18784 Acc: 0.806324531516184 TRAIN: [ 0 2 3 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 1 7 17 ... 187826 187828 187833] Samples_Test: 18784 Acc: 0.798818143100511 TRAIN: [ 0 1 3 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 2 9 13 ... 187782 187809 187817] Samples_Test: 18784 Acc: 0.8040885860306644 TRAIN: [ 0 1 2 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 10 16 18 ... 187803 187831 187836] Samples_Test: 18784 Acc: 0.8048871379897785 TRAIN: [ 0 1 2 ... 187838 187839 187840] Samples_Train: 169057 TEST: [ 5 15 26 ... 187799 187812 187821] Samples_Test: 18784 Acc: 0.805951873935264 Accuracy de cada iteracción:
[0.8011179132286399, 0.8040353492333902, 0.806697189097104, 0.8045144804088586, 0.8030238500851788, 0.806324531516184, 0.798818143100511, 0.8040885860306644, 0.8048871379897785, 0.805951873935264]
# Average accuracy over the 10 fold models
kf_Acc = np.asarray(scores_list).mean()
kf_Acc
0.8039459054625574
# Spread of the per-fold accuracies (population standard deviation)
kf_Acc_std = np.asarray(scores_list).std()
kf_Acc_std
0.0023158272854942007
# Report the cross-validated accuracy as mean +/- two standard deviations
print("Accuracy: %0.3f (+/- %0.3f)" % (kf_Acc, kf_Acc_std*2))
Accuracy: 0.804 (+/- 0.005)
Otra manera de calcular el accuracy del k-Fold
model_selection??
Type: module String form: <module 'sklearn.model_selection' from 'c:\\Users\\jagui\\anaconda3\\lib\\site-packages\\sklearn\\model_selection\\__init__.py'> File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\model_selection\__init__.py Source: import typing from ._plot import LearningCurveDisplay, ValidationCurveDisplay from ._search import GridSearchCV, ParameterGrid, ParameterSampler, RandomizedSearchCV from ._split import ( BaseCrossValidator, BaseShuffleSplit, GroupKFold, GroupShuffleSplit, KFold, LeaveOneGroupOut, LeaveOneOut, LeavePGroupsOut, LeavePOut, PredefinedSplit, RepeatedKFold, RepeatedStratifiedKFold, ShuffleSplit, StratifiedGroupKFold, StratifiedKFold, StratifiedShuffleSplit, TimeSeriesSplit, check_cv, train_test_split, ) from ._validation import ( cross_val_predict, cross_val_score, cross_validate, learning_curve, permutation_test_score, validation_curve, ) if typing.TYPE_CHECKING: # Avoid errors in type checkers (e.g. mypy) for experimental estimators. # TODO: remove this check once the estimator is no longer experimental. from ._search_successive_halving import ( # noqa HalvingGridSearchCV, HalvingRandomSearchCV, ) __all__ = [ "BaseCrossValidator", "BaseShuffleSplit", "GridSearchCV", "TimeSeriesSplit", "KFold", "GroupKFold", "GroupShuffleSplit", "LeaveOneGroupOut", "LeaveOneOut", "LeavePGroupsOut", "LeavePOut", "RepeatedKFold", "RepeatedStratifiedKFold", "ParameterGrid", "ParameterSampler", "PredefinedSplit", "RandomizedSearchCV", "ShuffleSplit", "StratifiedKFold", "StratifiedGroupKFold", "StratifiedShuffleSplit", "check_cv", "cross_val_predict", "cross_val_score", "cross_validate", "learning_curve", "LearningCurveDisplay", "permutation_test_score", "train_test_split", "validation_curve", "ValidationCurveDisplay", ] # TODO: remove this check once the estimator is no longer experimental. 
def __getattr__(name): if name in {"HalvingGridSearchCV", "HalvingRandomSearchCV"}: raise ImportError( f"{name} is experimental and the API might change without any " "deprecation cycle. To use it, you need to explicitly import " "enable_halving_search_cv:\n" "from sklearn.experimental import enable_halving_search_cv" ) raise AttributeError(f"module {__name__} has no attribute {name}")
model_selection.cross_val_score??
Signature: model_selection.cross_val_score( estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=nan, ) Source: def cross_val_score( estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch="2*n_jobs", error_score=np.nan, ): """Evaluate a score by cross-validation. Read more in the :ref:`User Guide <cross_validation>`. Parameters ---------- estimator : estimator object implementing 'fit' The object to use to fit the data. X : array-like of shape (n_samples, n_features) The data to fit. Can be for example a list, or an array. y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ default=None The target variable to try to predict in the case of supervised learning. groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`). scoring : str or callable, default=None A str (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)`` which should return only a single value. Similar to :func:`cross_validate` but only a single metric is permitted. If `None`, the estimator's default scorer (if available) is used. cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - `None`, to use the default 5-fold cross validation, - int, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - An iterable that generates (train, test) splits as arrays of indices. For `int`/`None` inputs, if the estimator is a classifier and `y` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. 
These splitters are instantiated with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide <cross_validation>` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22 `cv` default value if `None` changed from 3-fold to 5-fold. n_jobs : int, default=None Number of jobs to run in parallel. Training the estimator and computing the score are parallelized over the cross-validation splits. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary <n_jobs>` for more details. verbose : int, default=0 The verbosity level. fit_params : dict, default=None Parameters to pass to the fit method of the estimator. pre_dispatch : int or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - ``None``, in which case all the jobs are immediately created and spawned. Use this for lightweight and fast-running jobs, to avoid delays due to on-demand spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' error_score : 'raise' or numeric, default=np.nan Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. .. versionadded:: 0.20 Returns ------- scores : ndarray of float of shape=(len(list(cv)),) Array of scores of the estimator for each run of the cross validation. See Also -------- cross_validate : To run cross-validation on multiple metrics and also to return train scores, fit times and score times. cross_val_predict : Get predictions from each split of cross-validation for diagnostic purposes. 
sklearn.metrics.make_scorer : Make a scorer from a performance metric or loss function. Examples -------- >>> from sklearn import datasets, linear_model >>> from sklearn.model_selection import cross_val_score >>> diabetes = datasets.load_diabetes() >>> X = diabetes.data[:150] >>> y = diabetes.target[:150] >>> lasso = linear_model.Lasso() >>> print(cross_val_score(lasso, X, y, cv=3)) [0.3315057 0.08022103 0.03531816] """ # To ensure multimetric format is not supported scorer = check_scoring(estimator, scoring=scoring) cv_results = cross_validate( estimator=estimator, X=X, y=y, groups=groups, scoring={"score": scorer}, cv=cv, n_jobs=n_jobs, verbose=verbose, fit_params=fit_params, pre_dispatch=pre_dispatch, error_score=error_score, ) return cv_results["test_score"] File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py Type: function
# Instantiate the base model
dt = DecisionTreeClassifier(max_depth=9, random_state=42)

# Array with the accuracy of each of the k = 10 iterations.
# Every other cross_val_score argument in the original call (groups, scoring,
# n_jobs, verbose, fit_params, pre_dispatch, error_score) was set to its
# scikit-learn default, so they are simply omitted here.
scores = model_selection.cross_val_score(
    dt,          # model
    dev_df_X,    # features without the target
    dev_df_y,    # target
    cv=kf,       # validation strategy
)
scores
array([0.80111791, 0.80403535, 0.80669719, 0.80451448, 0.80302385,
0.80632453, 0.79881814, 0.80408859, 0.80488714, 0.80595187])
# Mean of the 10 accuracy values
scores.mean()
0.8039459054625574
# Standard deviation of the 10 accuracy values
scores.std()
0.0023158272854942007
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.804 (+/- 0.005)
Los métodos de ensamble consisten en combinar distintos modelos de base (estimadores de base o base learners) en un nuevo modelo (metamodelo o ensamble) que considera el resultado de todos estos para dar una predicción, esperando que la predicción combinada mejore la predicción de cada uno por separado.
Image('pictures/metodo_ensambles.jpg')
Combinamos algoritmos de diferentes familias o algoritmos de una misma familia o el mismo algoritmo con diferentes settings, trabajando a la vez sobre una misma tarea. Por ejemplo, podemos usar:
VENTAJA: Se trata de un método mucho más preciso que el DecisionTree, sin perjudicar la capacidad de generalización.
INCONVENIENTE: Necesitan mucha capacidad de computación.
Los estimadores de base deben cumplir:
Técnicas para construir el meta_modelo:
Del dataset original train/test se realizan N réplicas bootstrap o estimadores, cada una de las cuales tiene el mismo número de muestras aleatorias que el dataset train/test. En cada réplica puede haber muestras repetidas. Todas estas muestras formarán el train del estimador y el resto de muestras del dataset train/test formará el test. A cada una de las réplicas le aplicamos el DecisionTree (con la misma configuración en todas), obteniendo un conjunto de estimadores ("bosque de árboles") con una predicción cada uno de ellos. Finalmente aplicamos la técnica del voting a estas predicciones (promediando los resultados o asignando la clase mayoritaria).
Image('pictures/random_forest.jpg')
Usaremos RandomForestClassifier del módulo ensemble de scikit-learn
RandomForestClassifier??
Init signature: RandomForestClassifier( n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None, ) Source: class RandomForestClassifier(ForestClassifier): """ A random forest classifier. A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the `max_samples` parameter if `bootstrap=True` (default), otherwise the whole dataset is used to build each tree. For a comparison between tree-based ensemble models see the example :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`. Read more in the :ref:`User Guide <forest>`. Parameters ---------- n_estimators : int, default=100 The number of trees in the forest. .. versionchanged:: 0.22 The default value of ``n_estimators`` changed from 10 to 100 in 0.22. criterion : {"gini", "entropy", "log_loss"}, default="gini" The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "log_loss" and "entropy" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific. max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. 
- If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for fractions. min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions. min_weight_fraction_leaf : float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. max_features : {"sqrt", "log2", None}, int or float, default="sqrt" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and `max(1, int(max_features * n_features_in_))` features are considered at each split. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1 The default of `max_features` changed from `"auto"` to `"sqrt"`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. max_leaf_nodes : int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. 
min_impurity_decrease : float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 bootstrap : bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree. oob_score : bool or callable, default=False Whether to use out-of-bag samples to estimate the generalization score. By default, :func:`~sklearn.metrics.accuracy_score` is used. Provide a callable with signature `metric(y_true, y_pred)` to use a custom metric. Only available if `bootstrap=True`. n_jobs : int, default=None The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, :meth:`decision_path` and :meth:`apply` are all parallelized over the trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary <n_jobs>` for more details. random_state : int, RandomState instance or None, default=None Controls both the randomness of the bootstrapping of the samples used when building trees (if ``bootstrap=True``) and the sampling of the features to consider when looking for the best split at each node (if ``max_features < n_features``). See :term:`Glossary <random_state>` for details. verbose : int, default=0 Controls the verbosity when fitting and predicting. 
warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`Glossary <warm_start>` and :ref:`gradient_boosting_warm_start` for details. class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \ default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. Note that for multioutput (including multilabel) weights should be defined for each class of every column in its own dict. For example, for four-class multilabel classification weights should be [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of [{1:1}, {2:5}, {3:1}, {4:1}]. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` The "balanced_subsample" mode is the same as "balanced" except that weights are computed based on the bootstrap sample for every tree grown. For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See :ref:`minimal_cost_complexity_pruning` for details. .. versionadded:: 0.22 max_samples : int or float, default=None If bootstrap is True, the number of samples to draw from X to train each base estimator. - If None (default), then draw `X.shape[0]` samples. - If int, then draw `max_samples` samples. 
- If float, then draw `max(round(n_samples * max_samples), 1)` samples. Thus, `max_samples` should be in the interval `(0.0, 1.0]`. .. versionadded:: 0.22 Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` The child estimator template used to create the collection of fitted sub-estimators. .. versionadded:: 1.2 `base_estimator_` was renamed to `estimator_`. base_estimator_ : DecisionTreeClassifier The child estimator template used to create the collection of fitted sub-estimators. .. deprecated:: 1.2 `base_estimator_` is deprecated and will be removed in 1.4. Use `estimator_` instead. estimators_ : list of DecisionTreeClassifier The collection of fitted sub-estimators. classes_ : ndarray of shape (n_classes,) or a list of such arrays The classes labels (single output problem), or a list of arrays of class labels (multi-output problem). n_classes_ : int or list The number of classes (single output problem), or a list containing the number of classes for each output (multi-output problem). n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 n_outputs_ : int The number of outputs when ``fit`` is performed. feature_importances_ : ndarray of shape (n_features,) The impurity-based feature importances. The higher, the more important the feature. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance. Warning: impurity-based feature importances can be misleading for high cardinality features (many unique values). See :func:`sklearn.inspection.permutation_importance` as an alternative. oob_score_ : float Score of the training dataset obtained using an out-of-bag estimate. This attribute exists only when ``oob_score`` is True. 
oob_decision_function_ : ndarray of shape (n_samples, n_classes) or \ (n_samples, n_classes, n_outputs) Decision function computed with out-of-bag estimate on the training set. If n_estimators is small it might be possible that a data point was never left out during the bootstrap. In this case, `oob_decision_function_` might contain NaN. This attribute exists only when ``oob_score`` is True. See Also -------- sklearn.tree.DecisionTreeClassifier : A decision tree classifier. sklearn.ensemble.ExtraTreesClassifier : Ensemble of extremely randomized tree classifiers. sklearn.ensemble.HistGradientBoostingClassifier : A Histogram-based Gradient Boosting Classification Tree, very fast for big datasets (n_samples >= 10_000). Notes ----- The default values for the parameters controlling the size of the trees (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and unpruned trees which can potentially be very large on some data sets. To reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. The features are always randomly permuted at each split. Therefore, the best found split may vary, even with the same training data, ``max_features=n_features`` and ``bootstrap=False``, if the improvement of the criterion is identical for several splits enumerated during the search of the best split. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed. References ---------- .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. Examples -------- >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.datasets import make_classification >>> X, y = make_classification(n_samples=1000, n_features=4, ... n_informative=2, n_redundant=0, ... random_state=0, shuffle=False) >>> clf = RandomForestClassifier(max_depth=2, random_state=0) >>> clf.fit(X, y) RandomForestClassifier(...) 
>>> print(clf.predict([[0, 0, 0, 0]])) [1] """ _parameter_constraints: dict = { **ForestClassifier._parameter_constraints, **DecisionTreeClassifier._parameter_constraints, "class_weight": [ StrOptions({"balanced_subsample", "balanced"}), dict, list, None, ], } _parameter_constraints.pop("splitter") def __init__( self, n_estimators=100, *, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None, ): super().__init__( estimator=DecisionTreeClassifier(), n_estimators=n_estimators, estimator_params=( "criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", "min_impurity_decrease", "random_state", "ccp_alpha", ), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, ) self.criterion = criterion self.max_depth = max_depth self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease self.ccp_alpha = ccp_alpha File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py Type: ABCMeta Subclasses:
# Instantiate the model.
# Only the hyperparameters that differ from the scikit-learn defaults are
# passed explicitly; everything else (criterion='gini', min_samples_split=2,
# max_features='sqrt', bootstrap=True, oob_score=False, ...) keeps its
# default value, so the resulting estimator is identical.
rf = RandomForestClassifier(
    n_estimators=3,    # number of bootstrap replicas / base estimators (3 distinct DecisionTrees)
    max_depth=5,       # maximum depth of each tree
    random_state=42,   # reproducibility
)
RandomForestClassifier.fit??
Signature: RandomForestClassifier.fit(self, X, y, sample_weight=None) Source: @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """ Build a forest of trees from the training set (X, y). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csc_matrix``. y : array-like of shape (n_samples,) or (n_samples, n_outputs) The target values (class labels in classification, real numbers in regression). sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. Returns ------- self : object Fitted estimator. """ # Validate or convert input data if issparse(y): raise ValueError("sparse multilabel-indicator for y is not supported.") X, y = self._validate_data( X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE ) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. X.sort_indices() y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: warn( ( "A column-vector y was passed when a 1d array was" " expected. Please change the shape of y to " "(n_samples,), for example using ravel()." ), DataConversionWarning, stacklevel=2, ) if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs # [:, np.newaxis] that does not. 
y = np.reshape(y, (-1, 1)) if self.criterion == "poisson": if np.any(y < 0): raise ValueError( "Some value(s) of y are negative which is " "not allowed for Poisson regression." ) if np.sum(y) <= 0: raise ValueError( "Sum of y is not strictly positive which " "is necessary for Poisson regression." ) self.n_outputs_ = y.shape[1] y, expanded_class_weight = self._validate_y_class_weight(y) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) if expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight if not self.bootstrap and self.max_samples is not None: raise ValueError( "`max_sample` cannot be set if `bootstrap=False`. " "Either switch to `bootstrap=True` or set " "`max_sample=None`." ) elif self.bootstrap: n_samples_bootstrap = _get_n_samples_bootstrap( n_samples=X.shape[0], max_samples=self.max_samples ) else: n_samples_bootstrap = None self._validate_estimator() if not self.bootstrap and self.oob_score: raise ValueError("Out of bag estimation only available if bootstrap=True") random_state = check_random_state(self.random_state) if not self.warm_start or not hasattr(self, "estimators_"): # Free allocated memory, if any self.estimators_ = [] n_more_estimators = self.n_estimators - len(self.estimators_) if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " "len(estimators_)=%d when warm_start==True" % (self.n_estimators, len(self.estimators_)) ) elif n_more_estimators == 0: warn( "Warm-start fitting without increasing n_estimators does not " "fit new trees." ) else: if self.warm_start and len(self.estimators_) > 0: # We draw from the random state to get the random state we # would have got if we hadn't used a warm_start. 
random_state.randint(MAX_INT, size=len(self.estimators_)) trees = [ self._make_estimator(append=False, random_state=random_state) for i in range(n_more_estimators) ] # Parallel loop: we prefer the threading backend as the Cython code # for fitting the trees is internally releasing the Python GIL # making threading more efficient than multiprocessing in # that case. However, for joblib 0.12+ we respect any # parallel_backend contexts set at a higher level, # since correctness does not rely on using threads. trees = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, prefer="threads", )( delayed(_parallel_build_trees)( t, self.bootstrap, X, y, sample_weight, i, len(trees), verbose=self.verbose, class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap, ) for i, t in enumerate(trees) ) # Collect newly grown trees self.estimators_.extend(trees) if self.oob_score and ( n_more_estimators > 0 or not hasattr(self, "oob_score_") ): y_type = type_of_target(y) if y_type in ("multiclass-multioutput", "unknown"): # FIXME: we could consider to support multiclass-multioutput if # we introduce or reuse a constructor parameter (e.g. # oob_score) allowing our user to pass a callable defining the # scoring strategy on OOB sample. raise ValueError( "The type of target cannot be used to compute OOB " f"estimates. Got {y_type} while only the following are " "supported: continuous, continuous-multioutput, binary, " "multiclass, multilabel-indicator." ) if callable(self.oob_score): self._set_oob_score_and_attributes( X, y, scoring_function=self.oob_score ) else: self._set_oob_score_and_attributes(X, y) # Decapsulate classes_ attributes if hasattr(self, "classes_") and self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] return self File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py Type: function
Para entrenar el modelo le tenemos que pasar el y_train en 1D
y_train.shape
(169057, 1)
# Returns a 1-D array (vector)
np.ravel(y_train).shape
(169057,)
np.ravel(y_train)
array([0., 0., 0., ..., 0., 0., 0.])
# Train the model: X_train carries the features only, and the target is
# flattened to 1-D with np.ravel, as the estimator expects.
rf.fit(X=X_train, y=np.ravel(y_train))
RandomForestClassifier(max_depth=5, n_estimators=3, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(max_depth=5, n_estimators=3, random_state=42)
Una vez que se ha entrenado el modelo, podemos acceder a la información de los estimadores (modelos base) y trabajar con ellos como si fuesen modelos independientes.
Podemos trabajar con la información que nos aportan los estimadores, para ver cuáles son los atributos más importantes, de varias formas:
# Fitted base estimators (one DecisionTreeClassifier per bootstrap replica)
tree_list = rf.estimators_
tree_list
[DecisionTreeClassifier(max_depth=5, max_features='sqrt',
random_state=1608637542),
DecisionTreeClassifier(max_depth=5, max_features='sqrt',
random_state=1273642419),
DecisionTreeClassifier(max_depth=5, max_features='sqrt',
random_state=1935803228)]
export_graphviz??
Signature: export_graphviz( decision_tree, out_file=None, *, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, leaves_parallel=False, impurity=True, node_ids=False, proportion=False, rotate=False, rounded=False, special_characters=False, precision=3, fontname='helvetica', ) Source: @validate_params( { "decision_tree": "no_validation", "out_file": [str, None, HasMethods("write")], "max_depth": [Interval(Integral, 0, None, closed="left"), None], "feature_names": ["array-like", None], "class_names": ["array-like", "boolean", None], "label": [StrOptions({"all", "root", "none"})], "filled": ["boolean"], "leaves_parallel": ["boolean"], "impurity": ["boolean"], "node_ids": ["boolean"], "proportion": ["boolean"], "rotate": ["boolean"], "rounded": ["boolean"], "special_characters": ["boolean"], "precision": [Interval(Integral, 0, None, closed="left"), None], "fontname": [str], }, prefer_skip_nested_validation=True, ) def export_graphviz( decision_tree, out_file=None, *, max_depth=None, feature_names=None, class_names=None, label="all", filled=False, leaves_parallel=False, impurity=True, node_ids=False, proportion=False, rotate=False, rounded=False, special_characters=False, precision=3, fontname="helvetica", ): """Export a decision tree in DOT format. This function generates a GraphViz representation of the decision tree, which is then written into `out_file`. Once exported, graphical renderings can be generated using, for example:: $ dot -Tps tree.dot -o tree.ps (PostScript format) $ dot -Tpng tree.dot -o tree.png (PNG format) The sample counts that are shown are weighted with any sample_weights that might be present. Read more in the :ref:`User Guide <tree>`. Parameters ---------- decision_tree : object The decision tree estimator to be exported to GraphViz. out_file : object or str, default=None Handle or name of the output file. If ``None``, the result is returned as a string. .. 
versionchanged:: 0.20 Default of out_file changed from "tree.dot" to None. max_depth : int, default=None The maximum depth of the representation. If None, the tree is fully generated. feature_names : array-like of shape (n_features,), default=None An array containing the feature names. If None, generic names will be used ("x[0]", "x[1]", ...). class_names : array-like of shape (n_classes,) or bool, default=None Names of each of the target classes in ascending numerical order. Only relevant for classification and not supported for multi-output. If ``True``, shows a symbolic representation of the class name. label : {'all', 'root', 'none'}, default='all' Whether to show informative labels for impurity, etc. Options include 'all' to show at every node, 'root' to show only at the top root node, or 'none' to not show at any node. filled : bool, default=False When set to ``True``, paint nodes to indicate majority class for classification, extremity of values for regression, or purity of node for multi-output. leaves_parallel : bool, default=False When set to ``True``, draw all leaf nodes at the bottom of the tree. impurity : bool, default=True When set to ``True``, show the impurity at each node. node_ids : bool, default=False When set to ``True``, show the ID number on each node. proportion : bool, default=False When set to ``True``, change the display of 'values' and/or 'samples' to be proportions and percentages respectively. rotate : bool, default=False When set to ``True``, orient tree left to right rather than top-down. rounded : bool, default=False When set to ``True``, draw node boxes with rounded corners. special_characters : bool, default=False When set to ``False``, ignore special characters for PostScript compatibility. precision : int, default=3 Number of digits of precision for floating point in the values of impurity, threshold and value attributes of each node. fontname : str, default='helvetica' Name of font used to render text. 
Returns ------- dot_data : str String representation of the input tree in GraphViz dot format. Only returned if ``out_file`` is None. .. versionadded:: 0.18 Examples -------- >>> from sklearn.datasets import load_iris >>> from sklearn import tree >>> clf = tree.DecisionTreeClassifier() >>> iris = load_iris() >>> clf = clf.fit(iris.data, iris.target) >>> tree.export_graphviz(clf) 'digraph Tree {... """ if feature_names is not None: feature_names = check_array( feature_names, ensure_2d=False, dtype=None, ensure_min_samples=0 ) if class_names is not None and not isinstance(class_names, bool): class_names = check_array( class_names, ensure_2d=False, dtype=None, ensure_min_samples=0 ) check_is_fitted(decision_tree) own_file = False return_string = False try: if isinstance(out_file, str): out_file = open(out_file, "w", encoding="utf-8") own_file = True if out_file is None: return_string = True out_file = StringIO() exporter = _DOTTreeExporter( out_file=out_file, max_depth=max_depth, feature_names=feature_names, class_names=class_names, label=label, filled=filled, leaves_parallel=leaves_parallel, impurity=impurity, node_ids=node_ids, proportion=proportion, rotate=rotate, rounded=rounded, special_characters=special_characters, precision=precision, fontname=fontname, ) exporter.export(decision_tree) if return_string: return exporter.out_file.getvalue() finally: if own_file: out_file.close() File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\tree\_export.py Type: function
# Visualization of estimator 1 of the ensemble.
# FIX: removed the useless plt.figure(figsize=(8,8)) call — graphviz renders
# its own image, so the matplotlib figure was never drawn on and only produced
# a spurious empty "<Figure size 800x800 with 0 Axes>" output.
# Export the first fitted tree to DOT format
dot_data_tree_1 = export_graphviz(
    decision_tree = tree_list[0],          # first estimator of the forest
    out_file = None,                       # return the DOT source as a string
    max_depth=None,                        # draw the full tree
    feature_names = X_test.columns,
    class_names = ['No Delay', 'Delay'],
    label = 'all',
    filled = True,                         # color nodes by majority class
    leaves_parallel = False,
    impurity = True,
    node_ids = False,
    proportion = True,                     # proportions instead of raw counts
    rotate = True,                         # left-to-right layout
    rounded = True,
    special_characters = False,
    precision = 4,
    fontname='helvetica',
)
# Render the DOT source
graph_tree_1 = graphviz.Source(dot_data_tree_1)
graph_tree_1
<Figure size 800x800 with 0 Axes>
# Visualization of estimator 2 of the ensemble.
# FIX: removed the useless plt.figure(figsize=(8,8)) call — graphviz renders
# its own image, so the matplotlib figure only produced an empty-figure output.
# Export the second fitted tree to DOT format
dot_data_tree_2 = export_graphviz(
    decision_tree = tree_list[1],          # second estimator of the forest
    out_file = None,                       # return the DOT source as a string
    max_depth=None,
    feature_names = X_test.columns,
    class_names = ['No Delay', 'Delay'],
    label = 'all',
    filled = True,
    leaves_parallel = False,
    impurity = True,
    node_ids = False,
    proportion = True,
    rotate = True,
    rounded = True,
    special_characters = False,
    precision = 4,
    fontname='helvetica',
)
# Render the DOT source
graph_tree_2 = graphviz.Source(dot_data_tree_2)
graph_tree_2
<Figure size 800x800 with 0 Axes>
# Visualization of estimator 3 of the ensemble.
# FIX: removed the useless plt.figure(figsize=(8,8)) call — graphviz renders
# its own image, so the matplotlib figure only produced an empty-figure output.
# Export the third fitted tree to DOT format
dot_data_tree_3 = export_graphviz(
    decision_tree = tree_list[2],          # third estimator of the forest
    out_file = None,                       # return the DOT source as a string
    max_depth=None,
    feature_names = X_test.columns,
    class_names = ['No Delay', 'Delay'],
    label = 'all',
    filled = True,
    leaves_parallel = False,
    impurity = True,
    node_ids = False,
    proportion = True,
    rotate = True,
    rounded = True,
    special_characters = False,
    precision = 4,
    fontname='helvetica',
)
# Render the DOT source
graph_tree_3 = graphviz.Source(dot_data_tree_3)
graph_tree_3
<Figure size 800x800 with 0 Axes>
Una alternativa para visualizar la importancia de los atributos es el Feature Importance, que promedia el Information Gain de cada predictor en todo el ensamble.
Las puntuaciones de las feature importance desempeñan un papel importante en un modelo predictivo, ya que nos dan información sobre los datos, información sobre el modelo y la base para la reducción de la dimensionalidad y la selección de características que pueden mejorar la eficiencia y la eficacia de un modelo predictivo sobre el problema:
El algoritmo encuentra un conjunto de coeficientes para usar en la suma ponderada para hacer una predicción. Estos coeficientes se pueden usar directamente como un tipo crudo de puntaje de feature importance.
# Impurity-based feature-importance coefficients of the fitted RandomForest
# (one weight per training column; per sklearn they sum to ~1).
feature_importances = rf.feature_importances_
feature_importances
array([2.27398889e-02, 6.69160161e-01, 1.34848127e-01, 2.68759007e-03,
1.03055085e-03, 1.87339281e-03, 2.77998698e-03, 5.32593673e-02,
3.24476563e-03, 3.40448092e-03, 5.28790228e-03, 0.00000000e+00,
2.17361031e-02, 0.00000000e+00, 0.00000000e+00, 1.10740547e-03,
1.17111861e-03, 0.00000000e+00, 0.00000000e+00, 4.22089097e-02,
0.00000000e+00, 0.00000000e+00, 8.81353759e-05, 0.00000000e+00,
0.00000000e+00, 1.84936063e-03, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 1.07649666e-04, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 2.49191426e-03, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
1.79804620e-03, 0.00000000e+00, 3.64558140e-04, 4.40951312e-03,
0.00000000e+00, 3.24110179e-04, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 3.27102352e-03, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 7.88271066e-04, 0.00000000e+00, 1.00832053e-03,
1.28824429e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
6.93124595e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 1.41526331e-03, 0.00000000e+00, 1.07704992e-03,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 4.24872686e-04, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 6.19087250e-04, 1.54783395e-03,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 9.04662327e-04, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.94593548e-04,
0.00000000e+00, 1.48906555e-03, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 2.67427612e-04, 0.00000000e+00,
0.00000000e+00])
Vemos cuáles son los atributos más importantes, con sus coeficientes de ponderación, que usa el modelo para hacer la predicción.
# Top-20 attributes the RandomForest relies on, with their importance weights
rf_feature_importances = (
    pd.Series(feature_importances, index = X_train.columns)
    .sort_values(ascending = False)
    .head(20)
)
rf_feature_importances
TAXI_OUT 0.67 CRS_ARR_TIME 0.13 WEEK 0.05 OP_CARRIER_AIRLINE_ID_20409.0 0.04 CRS_DEP_TIME 0.02 OP_CARRIER_AIRLINE_ID_19790.0 0.02 ORIGIN_CITY_NAME_Chicago, IL 0.01 TAIL_NUM_COUNT 0.01 ORIGIN_MIA 0.00 WEEKEND 0.00 ORIGIN_ORD 0.00 WEEKDAY 0.00 MONTH 0.00 CRS_ELAPSED_TIME 0.00 ORIGIN_CLT 0.00 YEAR 0.00 ORIGIN_ATL 0.00 ORIGIN_LAX 0.00 ORIGIN_CITY_NAME_West Palm Beach/Palm Beach, FL 0.00 ORIGIN_STATE_NM_Oregon 0.00 dtype: float64
# Bar chart of the 20 most relevant attributes
plt.figure(figsize=(15,5))
rf_feature_importances.plot.bar(ylabel = 'Coeficientes de ponderación')
plt.title('Feature Importances');
Control del overfiting
# Depth sweep to locate where the DecisionTree starts overfitting:
# train accuracy keeps rising while test/validation accuracy flattens or drops.
for depth in range(1, 18):
    # Fit a tree of the given depth
    df_overfit = DecisionTreeClassifier(max_depth=depth, random_state=42, min_samples_split = 500)
    df_overfit.fit(X_train, y_train)
    # Accuracy on each split
    train_accuracy = df_overfit.score(X_train, y_train)
    test_accuracy = df_overfit.score(X_test, y_test)
    val_accuracy = df_overfit.score(val_df_X, val_df_y)
    print('Profundidad del árbol: {}. Acc_train: {} - Acc_test: {} - Acc_val: {}'.format(depth, train_accuracy, test_accuracy, val_accuracy))
Profundidad del árbol: 1. Acc_train: 0.794678717828898 - Acc_test: 0.7963160136286201 - Acc_val: 0.8106273370672915 Profundidad del árbol: 2. Acc_train: 0.7975889788651165 - Acc_test: 0.7982857751277683 - Acc_val: 0.8121519474892074 Profundidad del árbol: 3. Acc_train: 0.800079263207084 - Acc_test: 0.7998296422487223 - Acc_val: 0.8142703535491326 Profundidad del árbol: 4. Acc_train: 0.8019070491017822 - Acc_test: 0.801692930153322 - Acc_val: 0.8146073726950298 Profundidad del árbol: 5. Acc_train: 0.802954033255056 - Acc_test: 0.8029173764906303 - Acc_val: 0.8141419653030765 Profundidad del árbol: 6. Acc_train: 0.8037111743376494 - Acc_test: 0.8031303236797275 - Acc_val: 0.814992537433198 Profundidad del árbol: 7. Acc_train: 0.8047522433262154 - Acc_test: 0.8048339011925043 - Acc_val: 0.8156505271942354 Profundidad del árbol: 8. Acc_train: 0.8059589369265987 - Acc_test: 0.804195059625213 - Acc_val: 0.8152011683330391 Profundidad del árbol: 9. Acc_train: 0.8067574841621465 - Acc_test: 0.803822402044293 - Acc_val: 0.8121198504276933 Profundidad del árbol: 10. Acc_train: 0.8071301395387355 - Acc_test: 0.8033432708688245 - Acc_val: 0.8114618606666559 Profundidad del árbol: 11. Acc_train: 0.8083486634685343 - Acc_test: 0.8045144804088586 - Acc_val: 0.8100335414292822 Profundidad del árbol: 12. Acc_train: 0.8087449795039543 - Acc_test: 0.8038756388415673 - Acc_val: 0.8096323281603569 Profundidad del árbol: 13. Acc_train: 0.8090407377393424 - Acc_test: 0.8037691652470187 - Acc_val: 0.8090866781146187 Profundidad del árbol: 14. Acc_train: 0.8090762287275889 - Acc_test: 0.8037159284497445 - Acc_val: 0.8091508722376466 Profundidad del árbol: 15. Acc_train: 0.8091590410334976 - Acc_test: 0.8037691652470187 - Acc_val: 0.8089422413378056 Profundidad del árbol: 16. Acc_train: 0.8092536836688218 - Acc_test: 0.8037159284497445 - Acc_val: 0.8088138530917495 Profundidad del árbol: 17. Acc_train: 0.8093305808100226 - Acc_test: 0.8037159284497445 - Acc_val: 0.8082842515767682
Los mejores resultados los obtenemos para una profundidad de 7, con un acc de 0.8156
# Same depth sweep, this time scored with AUC on predicted probabilities
for depth in range(1, 18):
    df_overfit = DecisionTreeClassifier(max_depth=depth, random_state=42, min_samples_split = 500)
    df_overfit.fit(X_train, y_train)
    # Positive-class probabilities for each split, index-aligned with the target
    y_train_score = pd.DataFrame(df_overfit.predict_proba(X = X_train)[:,1], index = y_train.index, columns = ['DelayScore_dt'])
    y_test_score = pd.DataFrame(df_overfit.predict_proba(X = X_test)[:,1], index = y_test.index, columns = ['DelayScore_dt'])
    y_val_score = pd.DataFrame(df_overfit.predict_proba(X = val_df_X)[:,1], index = val_df_y.index, columns = ['DelayScore_dt'])
    # ROC AUC per split
    AUC_train = metrics.roc_auc_score(y_train, y_train_score)
    AUC_test = metrics.roc_auc_score(y_test, y_test_score)
    AUC_val = metrics.roc_auc_score(val_df_y, y_val_score)
    print('Profundidad de árbol: {}. AUC_train: {} - AUC_test: {} - AUC_Val: {}'.format(depth, AUC_train, AUC_test, AUC_val))
Profundidad de árbol: 1. AUC_train: 0.5689650676633251 - AUC_test: 0.5714936307962702 - AUC_Val: 0.5613778268373949 Profundidad de árbol: 2. AUC_train: 0.6343891976473679 - AUC_test: 0.6322272661575088 - AUC_Val: 0.6283040136940719 Profundidad de árbol: 3. AUC_train: 0.6596142441928153 - AUC_test: 0.6566235588017176 - AUC_Val: 0.651546789529985 Profundidad de árbol: 4. AUC_train: 0.6803856479158885 - AUC_test: 0.6762564032255843 - AUC_Val: 0.6713527909702178 Profundidad de árbol: 5. AUC_train: 0.6933739151011526 - AUC_test: 0.6868712978794775 - AUC_Val: 0.6804467244110811 Profundidad de árbol: 6. AUC_train: 0.7046516533783733 - AUC_test: 0.6981604545843509 - AUC_Val: 0.689106919687651 Profundidad de árbol: 7. AUC_train: 0.7132684825394768 - AUC_test: 0.7038337540554852 - AUC_Val: 0.6985050639964778 Profundidad de árbol: 8. AUC_train: 0.7205723982396101 - AUC_test: 0.7068106557715637 - AUC_Val: 0.7018907788043134 Profundidad de árbol: 9. AUC_train: 0.726789809517846 - AUC_test: 0.7109911497562211 - AUC_Val: 0.707622861131071 Profundidad de árbol: 10. AUC_train: 0.733263067892799 - AUC_test: 0.7151377359260395 - AUC_Val: 0.7039194602906105 Profundidad de árbol: 11. AUC_train: 0.7390195601213208 - AUC_test: 0.7178089520179077 - AUC_Val: 0.7016866819372841 Profundidad de árbol: 12. AUC_train: 0.7439132420580266 - AUC_test: 0.7184186929204722 - AUC_Val: 0.7007341198477631 Profundidad de árbol: 13. AUC_train: 0.748287939549002 - AUC_test: 0.7191318129555728 - AUC_Val: 0.69803182811776 Profundidad de árbol: 14. AUC_train: 0.7523334492441041 - AUC_test: 0.7176649493971902 - AUC_Val: 0.6964429100775656 Profundidad de árbol: 15. AUC_train: 0.7555121933451712 - AUC_test: 0.717959720996295 - AUC_Val: 0.6964854220591516 Profundidad de árbol: 16. AUC_train: 0.7578193721328269 - AUC_test: 0.7195227158284367 - AUC_Val: 0.695756475171957 Profundidad de árbol: 17. AUC_train: 0.7598237011841995 - AUC_test: 0.7186774110682237 - AUC_Val: 0.6943306389866815
Los mejores resultados los obtenemos para una profundidad de 9, con un AUC de 0.7076
# RandomForest depth sweep: 50 estimators, min_samples_split of 500
for depth in range(1, 20):
    df_overfit = RandomForestClassifier(n_estimators = 50, criterion = 'gini', max_depth = depth, min_samples_split = 500, bootstrap = True, random_state = 42)
    df_overfit.fit(X = X_train, y = np.ravel(y_train))  # y flattened to 1-D
    # Positive-class probabilities for each split, index-aligned with the target
    y_train_score = pd.DataFrame(df_overfit.predict_proba(X = X_train)[:,1], index = y_train.index, columns=['DelayScore'])
    y_test_score = pd.DataFrame(df_overfit.predict_proba(X = X_test)[:,1], index = y_test.index, columns=['DelayScore'])
    y_val_score = pd.DataFrame(df_overfit.predict_proba(X = val_df_X)[:,1], index = val_df_y.index, columns=['DelayScore'])
    # ROC AUC per split
    AUC_train = metrics.roc_auc_score(y_train, y_train_score)
    AUC_test = metrics.roc_auc_score(y_test, y_test_score)
    AUC_val = metrics.roc_auc_score(val_df_y, y_val_score)
    print('Profundidad de árbol: {}. AUC_train: {} - AUC_test: {} - AUC_Val: {}'.format(depth, AUC_train, AUC_test, AUC_val))
Profundidad de árbol: 1. AUC_train: 0.6610362618665112 - AUC_test: 0.6565236937825283 - AUC_Val: 0.64656752916795 Profundidad de árbol: 2. AUC_train: 0.6882921625721049 - AUC_test: 0.6823545556213607 - AUC_Val: 0.6739843998835092 Profundidad de árbol: 3. AUC_train: 0.7000451299910659 - AUC_test: 0.6925571924058996 - AUC_Val: 0.6865814890872681 Profundidad de árbol: 4. AUC_train: 0.7069687517865323 - AUC_test: 0.6979005960392394 - AUC_Val: 0.6914805538428631 Profundidad de árbol: 5. AUC_train: 0.7099386358775781 - AUC_test: 0.6990825207579774 - AUC_Val: 0.6932695283148222 Profundidad de árbol: 6. AUC_train: 0.7171897052064472 - AUC_test: 0.7068490913862915 - AUC_Val: 0.699524022902654 Profundidad de árbol: 7. AUC_train: 0.7191667917469948 - AUC_test: 0.7077834907461822 - AUC_Val: 0.7002896394435263 Profundidad de árbol: 8. AUC_train: 0.7236083613947961 - AUC_test: 0.7119053598591449 - AUC_Val: 0.7042189476954116 Profundidad de árbol: 9. AUC_train: 0.7298809253275498 - AUC_test: 0.7167606311854413 - AUC_Val: 0.7073249877862414 Profundidad de árbol: 10. AUC_train: 0.7337284963416493 - AUC_test: 0.7203132815153668 - AUC_Val: 0.7083152587075981 Profundidad de árbol: 11. AUC_train: 0.7363070683564722 - AUC_test: 0.7228644010941394 - AUC_Val: 0.7116327910687038 Profundidad de árbol: 12. AUC_train: 0.7397284754900202 - AUC_test: 0.7226290061842482 - AUC_Val: 0.7127212334271289 Profundidad de árbol: 13. AUC_train: 0.7438179752337641 - AUC_test: 0.7260371048132457 - AUC_Val: 0.7130625156342241 Profundidad de árbol: 14. AUC_train: 0.745845498280208 - AUC_test: 0.7283894658032423 - AUC_Val: 0.7137022143559526 Profundidad de árbol: 15. AUC_train: 0.7489258673677804 - AUC_test: 0.7295128416767761 - AUC_Val: 0.7150385307908602 Profundidad de árbol: 16. AUC_train: 0.7510758659477412 - AUC_test: 0.7306534586689164 - AUC_Val: 0.7143310252877451 Profundidad de árbol: 17. AUC_train: 0.75587516031577 - AUC_test: 0.7331918226920322 - AUC_Val: 0.7165820965917953 Profundidad de árbol: 18. 
AUC_train: 0.7566645188609378 - AUC_test: 0.7337628238739062 - AUC_Val: 0.7159764289219264 Profundidad de árbol: 19. AUC_train: 0.7585416760341603 - AUC_test: 0.7344394342987093 - AUC_Val: 0.7157047594545038
Comparación entre el DecisionTree y el RandomForest
Comparamos el rendimiento de ambos modelos para una profundidad de árbol de 5, un mínimo de registros por split de 500 y un random state de 42.
Como vamos a utilizar la métrica AUC/ROC Curve, necesitamos que los modelos nos den la predicción en probabilidad (predict_proba y no predict).
# DecisionTree for the model comparison (depth 5, min_samples_split 500)
dt = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=5,
    min_samples_split=500,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=42,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    class_weight=None,
    ccp_alpha=0.0,
)
# BUG FIX: this model was previously fitted on the TEST split
# (dt.fit(X=X_test, y=y_test)) and then evaluated on that very same split,
# leaking test data and inflating its metrics in the comparison against the
# RandomForest (which is trained on X_train). Fit on the training split.
dt.fit(
    X = X_train,
    y = y_train
)
DecisionTreeClassifier(max_depth=5, min_samples_split=500, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(max_depth=5, min_samples_split=500, random_state=42)
# DecisionTree scores on X_test: probability of class 1 for every row,
# indexed like y_test so the frames can be joined later.
dt_scores = dt.predict_proba(X = X_test)[:,1]
y_test_score_dt = pd.DataFrame(dt_scores, index=y_test.index, columns=['DelayScore_dt'])
y_test_score_dt.head()
| DelayScore_dt | |
|---|---|
| 5 | 0.10 |
| 15 | 0.15 |
| 26 | 0.36 |
| 28 | 0.10 |
| 36 | 0.15 |
# RandomForest with hyperparameters matching the DecisionTree comparison
rf = RandomForestClassifier(
    n_estimators = 50,
    max_depth = 5,
    min_samples_split=500,
    random_state=42,
)
# Fit on the training split; y must be 1-D, hence np.ravel
rf.fit(X_train, np.ravel(y_train))
RandomForestClassifier(max_depth=5, min_samples_split=500, n_estimators=50,
random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomForestClassifier(max_depth=5, min_samples_split=500, n_estimators=50,
# RandomForest scores on X_test: probability of class 1, index-aligned with y_test
rf_scores = rf.predict_proba(X_test)[:,1]
y_test_score_rf = pd.DataFrame(rf_scores, index=y_test.index, columns=['DelayScore_rf'])
y_test_score_rf.head()
| DelayScore_rf | |
|---|---|
| 5 | 0.17 |
| 15 | 0.19 |
| 26 | 0.20 |
| 28 | 0.18 |
| 36 | 0.21 |
Hacemos una tabla de resultados en el que aparece el target real y las predicciones en probabilidad del DecisionTree y del RandomForest
# Results table: real target plus the probability predictions of both models
results_df_ = y_test.join(y_test_score_dt).join(y_test_score_rf)
results_df_.sample(20)
| ARR_DEL15 | DelayScore_dt | DelayScore_rf | |
|---|---|---|---|
| 216207 | 0.00 | 0.25 | 0.22 |
| 103205 | 0.00 | 0.34 | 0.27 |
| 233439 | 0.00 | 0.23 | 0.24 |
| 4820 | 0.00 | 0.16 | 0.20 |
| 45206 | 0.00 | 0.16 | 0.19 |
| 134526 | 0.00 | 0.15 | 0.20 |
| 88786 | 0.00 | 0.15 | 0.15 |
| 196809 | 0.00 | 0.22 | 0.22 |
| 181549 | 0.00 | 0.15 | 0.17 |
| 222461 | 1.00 | 0.15 | 0.20 |
| 196604 | 0.00 | 0.23 | 0.22 |
| 123265 | 1.00 | 0.78 | 0.35 |
| 188288 | 0.00 | 0.10 | 0.18 |
| 45659 | 0.00 | 0.15 | 0.21 |
| 24334 | 1.00 | 0.91 | 0.32 |
| 42487 | 0.00 | 0.15 | 0.20 |
| 73028 | 0.00 | 0.10 | 0.19 |
| 43973 | 0.00 | 0.16 | 0.19 |
| 62603 | 0.00 | 0.10 | 0.19 |
| 25612 | 1.00 | 0.40 | 0.32 |
Determinamos la ROC Curve y el AUC
# AUC of the DecisionTree on the test scores
auc_dt = metrics.roc_auc_score(
    results_df_['ARR_DEL15'],     # true labels (y_test)
    results_df_['DelayScore_dt']  # predicted probabilities
)
print(auc_dt)
0.6931513901148161
# AUC of the RandomForest on the test scores
auc_rf = metrics.roc_auc_score(
    results_df_['ARR_DEL15'],     # true labels (y_test)
    results_df_['DelayScore_rf']  # predicted probabilities
)
print(auc_rf)
0.6990825207579774
# ROC points (false/true positive rates and thresholds) for both models
fpr_dt, tpr_dt, th_dt = metrics.roc_curve(results_df_['ARR_DEL15'], results_df_['DelayScore_dt'])
fpr_rf, tpr_rf, th_rf = metrics.roc_curve(results_df_['ARR_DEL15'], results_df_['DelayScore_rf'])
# Plot both ROC curves against the random-guess diagonal
plt.clf()
for fpr, tpr, curve_label in ((fpr_dt, tpr_dt, 'DecisionTree'), (fpr_rf, tpr_rf, 'RandomForest')):
    plt.plot(fpr, tpr, label = curve_label)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.legend()
plt.show()
Métrica del Gain Score y Lift Score
Un aspecto importante a tener en cuenta es que el valor de score obtenido es un valor CONTINUO, por lo que deberemos trabajar con deciles.
Los deciles son números que dividen los datos en 10 grupos con la misma frecuencia. Cada grupo vendrá dado por un intervalo de probabilidades
El primer decil es el punto donde el 10% de todos los valores de datos se encuentran por debajo de él. El segundo decil es el punto donde el 20% de todos los valores de datos se encuentran por debajo de él, y así sucesivamente.
Pasos a seguir para determinar las métricas Gain y Lift:
# Number of distinct score values produced by the RandomForest
# (the score is continuous, which motivates bucketing into deciles below)
y_test_score_rf.nunique()
DelayScore_rf 14331 dtype: int64
Para determinar los deciles usamos la función qcut de Pandas
pd.qcut?
Lo hacemos primero con los datos reales, para luego hacer una función con todos los pasos y pasarle el X_test
# Rank rows by RandomForest score, best first
results_df_ = results_df_.sort_values(by='DelayScore_rf', ascending = False)
# Bucket the continuous scores into 10 equal-frequency bins (deciles);
# labels=None / retbins=False / precision=3 / duplicates='raise' are the
# pd.qcut defaults, so only the non-defaults are spelled out.
results_df_['Decile'] = pd.qcut(results_df_['DelayScore_rf'], q = 10)
# Summary of results per probability interval (decile):
# row count, delayed count and delay rate for every bucket
results_summary_table = results_df_.pivot_table(
    index = 'Decile',
    values = target,
    aggfunc = [len, np.sum, np.mean]
)
results_summary_table
# Rename the aggregate columns and derive the gain/lift figures
results_summary_table.columns = ['Nº Vuelos', 'Nº Vuelos con Retraso', 'Accuracy']
results_summary_table['Accuracy'] *= 100
# Share of each bucket over the whole population / over all delays
to_pct = lambda s: s / s.sum() * 100
results_summary_table['Porcentaje Nº Vuelos'] = to_pct(results_summary_table['Nº Vuelos'])
results_summary_table['Porcentaje Nº Vuelos con Retraso'] = to_pct(results_summary_table['Nº Vuelos con Retraso'])
# Running totals down the deciles
results_summary_table['Nº Vuelos Acumulado'] = results_summary_table['Nº Vuelos'].cumsum()
results_summary_table['Nº Vuelos con Retraso Acumulados'] = results_summary_table['Nº Vuelos con Retraso'].cumsum()
results_summary_table['Porcentaje Nº Vuelos Acumulado'] = results_summary_table['Porcentaje Nº Vuelos'].cumsum()
results_summary_table['Porcentaje Nº Vuelos con Retraso Acumulado'] = results_summary_table['Porcentaje Nº Vuelos con Retraso'].cumsum()
# Lift = cumulative % of delays captured vs. cumulative % of flights inspected
results_summary_table['Cumulative Lift'] = (
    results_summary_table['Porcentaje Nº Vuelos con Retraso Acumulado']
    / results_summary_table['Porcentaje Nº Vuelos Acumulado']
)
results_summary_table
| Nº Vuelos | Nº Vuelos con Retraso | Accuracy | Porcentaje Nº Vuelos | Porcentaje Nº Vuelos con Retraso | Nº Vuelos Acumulado | Nº Vuelos con Retraso Acumulados | Porcentaje Nº Vuelos Acumulado | Porcentaje Nº Vuelos con Retraso Acumulado | Cumulative Lift | |
|---|---|---|---|---|---|---|---|---|---|---|
| Decile | ||||||||||
| (0.124, 0.173] | 1879 | 156.00 | 8.30 | 10.00 | 3.90 | 1879 | 156.00 | 10.00 | 3.90 | 0.39 |
| (0.173, 0.184] | 1878 | 191.00 | 10.17 | 10.00 | 4.77 | 3757 | 347.00 | 20.00 | 8.66 | 0.43 |
| (0.184, 0.19] | 1878 | 235.00 | 12.51 | 10.00 | 5.87 | 5635 | 582.00 | 30.00 | 14.53 | 0.48 |
| (0.19, 0.198] | 1879 | 265.00 | 14.10 | 10.00 | 6.62 | 7514 | 847.00 | 40.00 | 21.15 | 0.53 |
| (0.198, 0.205] | 1878 | 313.00 | 16.67 | 10.00 | 7.82 | 9392 | 1,160.00 | 50.00 | 28.96 | 0.58 |
| (0.205, 0.214] | 1878 | 350.00 | 18.64 | 10.00 | 8.74 | 11270 | 1,510.00 | 60.00 | 37.70 | 0.63 |
| (0.214, 0.226] | 1879 | 376.00 | 20.01 | 10.00 | 9.39 | 13149 | 1,886.00 | 70.00 | 47.09 | 0.67 |
| (0.226, 0.241] | 1878 | 500.00 | 26.62 | 10.00 | 12.48 | 15027 | 2,386.00 | 80.00 | 59.58 | 0.74 |
| (0.241, 0.266] | 1878 | 610.00 | 32.48 | 10.00 | 15.23 | 16905 | 2,996.00 | 90.00 | 74.81 | 0.83 |
| (0.266, 0.455] | 1879 | 1,009.00 | 53.70 | 10.00 | 25.19 | 18784 | 4,005.00 | 100.00 | 100.00 | 1.00 |
Definimos una función que genere esta tabla resumen de resultados (tabla de ganancias) para un caso genérico. En donde:
# Generic helper that performs all the previous steps in one call
def get_gain_table(model, X_test, y_test):
    '''
    Build a decile gains/lift table for a fitted binary classifier.

    model  : estimator exposing predict_proba; column 1 is taken as the
             positive-class score.
    X_test : feature matrix to score.
    y_test : single-column DataFrame with the true binary target, sharing
             its index with X_test.

    Returns a DataFrame indexed by score decile (highest scores first) with
    case/response counts, percentages, cumulative figures and the lift.
    '''
    scores = pd.DataFrame(
        model.predict_proba(X_test)[:, 1], index=y_test.index, columns=['Score']
    )
    # Attach the scores to the target and rank best-scored rows first
    ranked = y_test.join(scores).sort_values(by='Score', ascending=False)
    # Equal-frequency buckets over the continuous score
    ranked['Decile'] = pd.qcut(ranked['Score'], q=10)
    gain = ranked.pivot_table(
        index='Decile', values=y_test.columns, aggfunc=[len, np.sum, np.mean]
    ).sort_index(ascending=False)
    gain.columns = ['Cases', 'Responses', 'Accuracy']
    gain['Accuracy'] = gain['Accuracy'] * 100
    gain['Pct. Cases'] = gain['Cases'] / gain['Cases'].sum() * 100
    gain['Pct. Responses'] = gain['Responses'] / gain['Responses'].sum() * 100
    gain['Cum. Cases'] = gain['Cases'].cumsum()
    gain['Cum. Responses'] = gain['Responses'].cumsum()
    gain['Cum. Pct. Cases'] = gain['Pct. Cases'].cumsum()
    gain['Cum. Pct. Responses'] = gain['Pct. Responses'].cumsum()
    # Lift = cumulative % of responses captured vs. cumulative % of cases seen
    gain['Cumulative Lift'] = gain['Cum. Pct. Responses'] / gain['Cum. Pct. Cases']
    return gain
# Gains table of the RandomForest on the test split
rf_results_summary_table = get_gain_table(rf, X_test, y_test)
rf_results_summary_table
| Cases | Responses | Accuracy | Pct. Cases | Pct. Responses | Cum. Cases | Cum. Responses | Cum. Pct. Cases | Cum. Pct. Responses | Cumulative Lift | |
|---|---|---|---|---|---|---|---|---|---|---|
| Decile | ||||||||||
| (0.266, 0.455] | 1879 | 1,009.00 | 53.70 | 10.00 | 25.19 | 1879 | 1,009.00 | 10.00 | 25.19 | 2.52 |
| (0.241, 0.266] | 1878 | 610.00 | 32.48 | 10.00 | 15.23 | 3757 | 1,619.00 | 20.00 | 40.42 | 2.02 |
| (0.226, 0.241] | 1878 | 500.00 | 26.62 | 10.00 | 12.48 | 5635 | 2,119.00 | 30.00 | 52.91 | 1.76 |
| (0.214, 0.226] | 1879 | 376.00 | 20.01 | 10.00 | 9.39 | 7514 | 2,495.00 | 40.00 | 62.30 | 1.56 |
| (0.205, 0.214] | 1878 | 350.00 | 18.64 | 10.00 | 8.74 | 9392 | 2,845.00 | 50.00 | 71.04 | 1.42 |
| (0.198, 0.205] | 1878 | 313.00 | 16.67 | 10.00 | 7.82 | 11270 | 3,158.00 | 60.00 | 78.85 | 1.31 |
| (0.19, 0.198] | 1879 | 265.00 | 14.10 | 10.00 | 6.62 | 13149 | 3,423.00 | 70.00 | 85.47 | 1.22 |
| (0.184, 0.19] | 1878 | 235.00 | 12.51 | 10.00 | 5.87 | 15027 | 3,658.00 | 80.00 | 91.34 | 1.14 |
| (0.173, 0.184] | 1878 | 191.00 | 10.17 | 10.00 | 4.77 | 16905 | 3,849.00 | 90.00 | 96.10 | 1.07 |
| (0.124, 0.173] | 1879 | 156.00 | 8.30 | 10.00 | 3.90 | 18784 | 4,005.00 | 100.00 | 100.00 | 1.00 |
# Relabel the decile intervals with cumulative population percentages
# (10% .. 100%) so the gains/lift charts below get a numeric x-axis
rf_results_summary_table.index = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
rf_results_summary_table
| Cases | Responses | Accuracy | Pct. Cases | Pct. Responses | Cum. Cases | Cum. Responses | Cum. Pct. Cases | Cum. Pct. Responses | Cumulative Lift | |
|---|---|---|---|---|---|---|---|---|---|---|
| 10 | 1879 | 1,009.00 | 53.70 | 10.00 | 25.19 | 1879 | 1,009.00 | 10.00 | 25.19 | 2.52 |
| 20 | 1878 | 610.00 | 32.48 | 10.00 | 15.23 | 3757 | 1,619.00 | 20.00 | 40.42 | 2.02 |
| 30 | 1878 | 500.00 | 26.62 | 10.00 | 12.48 | 5635 | 2,119.00 | 30.00 | 52.91 | 1.76 |
| 40 | 1879 | 376.00 | 20.01 | 10.00 | 9.39 | 7514 | 2,495.00 | 40.00 | 62.30 | 1.56 |
| 50 | 1878 | 350.00 | 18.64 | 10.00 | 8.74 | 9392 | 2,845.00 | 50.00 | 71.04 | 1.42 |
| 60 | 1878 | 313.00 | 16.67 | 10.00 | 7.82 | 11270 | 3,158.00 | 60.00 | 78.85 | 1.31 |
| 70 | 1879 | 265.00 | 14.10 | 10.00 | 6.62 | 13149 | 3,423.00 | 70.00 | 85.47 | 1.22 |
| 80 | 1878 | 235.00 | 12.51 | 10.00 | 5.87 | 15027 | 3,658.00 | 80.00 | 91.34 | 1.14 |
| 90 | 1878 | 191.00 | 10.17 | 10.00 | 4.77 | 16905 | 3,849.00 | 90.00 | 96.10 | 1.07 |
| 100 | 1879 | 156.00 | 8.30 | 10.00 | 3.90 | 18784 | 4,005.00 | 100.00 | 100.00 | 1.00 |
El Gain Score para un decil determinado, es la relación entre el número acumulado de responses hasta ese decil y el número total de responses que hay en todo el conjunto de datos. Interpretación:
El lift mide cuánto mejor se puede hacer con un modelo en comparación con no tener modelo (o un modelo tonto). Es la relación entre el % de ganancia y el porcentaje de expectativa aleatoria en un nivel de decil dado. La expectativa aleatoria en el x-ésimo decil es x%. Interpretación:
# Cumulative gains chart: % of delays captured vs. % of flights inspected
plt.figure(figsize=(15,5))
rf_results_summary_table['Cum. Pct. Responses'].plot.line(xlabel = '% Cases', ylabel = '% Responses')
plt.title('Gain Score');
# Cumulative lift chart: improvement over random targeting per decile
plt.figure(figsize=(15,5))
rf_results_summary_table['Cumulative Lift'].plot.line(xlabel = '% Cases', ylabel = 'Lift')
plt.title('Lift');
El Gradient Boosting es un algoritmo basado en el ensamble de modelos débiles para obtener un modelo más robusto, de forma que cada modelo se genera sobre el anterior con la intención de corregir el error producido por este. Para ello, se utiliza un vector de pesos asociados a las distintas observaciones del dataset, inicializado con pesos uniformes (inicialmente tenemos una réplica del dataset balanceada).
A medida que vamos avanzando, en cada iteracción, se incrementan los pesos de los errores (observaciones mal clasificadas) para que tengan mayor relevancia en la construcción de futuros modelos, y se disminuyen los pesos de los aciertos (observaciones bien clasificadas).
Finalmente se combinan las predicciones de todos los modelos generados, mediante Voting.
En el GBM (Gradient Boost Machine) habitualmente se utilizan como estimadores base los DecisionTree y como algoritmo de optimización (para mejorar la precisión) el método Gradient Descent (Descenso del Gradiente).
Implementaciones más usadas:
# Display an explanatory gradient-boosting diagram (local notebook asset)
Image('pictures/gradient_boosting.jpg')
¿Cómo ajustar los pesos de forma óptima?
Para ellos se utiliza una función llamada Loss Function (función de coste) como el logloss y se utiliza un algoritmo de optimización llamado Gradient Descent (descenso del gradiente), que consiste en hacer derivadas parciales de la función de coste para decidir la dirección de avance.
# Display the GBM loss-function diagram (local notebook asset)
Image('pictures/loss_function_GBM.jpg')
Para decidir el número de iteraciones que hay que hacer, se suele buscar el cambio de pendiente más brusco en la Loss Function, para evitar mínimos relativos.
Utilizamos el Gradient Boosting de sklearn
GradientBoostingClassifier??
# Gradient Boosting classifier: log-loss objective, 100 depth-5 trees
gb = GradientBoostingClassifier(
    loss = 'log_loss',            # good choice for probabilistic classification
    learning_rate = 0.1,          # shrinkage; a knob to tune for performance
    n_estimators = 100,
    subsample = 1.0,
    criterion = 'friedman_mse',
    min_samples_split = 500,
    min_samples_leaf = 1,
    min_weight_fraction_leaf = 0.0,
    max_depth = 5,
    min_impurity_decrease = 0.0,
    init = None,
    random_state = 42,
    max_features = None,
    verbose = 0,
    max_leaf_nodes = None,
    warm_start = False,
    validation_fraction = 0.1,
    n_iter_no_change = None,
    tol = 0.0001,
    ccp_alpha = 0.0,
)
gb
# Fit on the training split (y flattened to 1-D)
gb.fit(X = X_train, y = np.ravel(y_train))
GradientBoostingClassifier(max_depth=5, min_samples_split=500, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingClassifier(max_depth=5, min_samples_split=500, random_state=42)
# Predicted probability of the positive class (delay > 15 min) for every
# test observation, kept as a one-column DataFrame aligned on y_test's index.
delay_probs = gb.predict_proba(X_test)[:, 1]
y_test_score_gb = pd.DataFrame(
    {'DelayScore_gb': delay_probs},
    index=y_test.index,
)
y_test_score_gb
| DelayScore_gb | |
|---|---|
| 5 | 0.09 |
| 15 | 0.16 |
| 26 | 0.34 |
| 28 | 0.19 |
| 36 | 0.13 |
| ... | ... |
| 250090 | 0.16 |
| 250100 | 0.14 |
| 250110 | 0.04 |
| 250123 | 0.57 |
| 250132 | 0.23 |
18784 rows × 1 columns
# Sanity check: count distinct scores (almost every prediction is unique)
y_test_score_gb.nunique()
DelayScore_gb 18695 dtype: int64
# Results table: left-join the true labels with the predicted scores on the index
results_gb = y_test.join(y_test_score_gb)
results_gb
| ARR_DEL15 | DelayScore_gb | |
|---|---|---|
| 5 | 0.00 | 0.09 |
| 15 | 0.00 | 0.16 |
| 26 | 0.00 | 0.34 |
| 28 | 1.00 | 0.19 |
| 36 | 1.00 | 0.13 |
| ... | ... | ... |
| 250090 | 0.00 | 0.16 |
| 250100 | 0.00 | 0.14 |
| 250110 | 0.00 | 0.04 |
| 250123 | 0.00 | 0.57 |
| 250132 | 1.00 | 0.23 |
18784 rows × 2 columns
Validación con la métrica AUC/ROC Curve
# AUC of the Gradient Boosting scores on the test set
auc_gb = metrics.roc_auc_score(y_test, y_test_score_gb)
print(auc_gb)
0.7602153458795626
Tabla de ganancias
# Decile-based gains table summarising model performance on the test set
gb_results_summary_table = get_gain_table(model=gb, X_test=X_test, y_test=y_test)
gb_results_summary_table
| Cases | Responses | Accuracy | Pct. Cases | Pct. Responses | Cum. Cases | Cum. Responses | Cum. Pct. Cases | Cum. Pct. Responses | Cumulative Lift | |
|---|---|---|---|---|---|---|---|---|---|---|
| Decile | ||||||||||
| (0.416, 0.989] | 1879 | 1,233.00 | 65.62 | 10.00 | 30.79 | 1879 | 1,233.00 | 10.00 | 30.79 | 3.08 |
| (0.294, 0.416] | 1878 | 675.00 | 35.94 | 10.00 | 16.85 | 3757 | 1,908.00 | 20.00 | 47.64 | 2.38 |
| (0.235, 0.294] | 1878 | 499.00 | 26.57 | 10.00 | 12.46 | 5635 | 2,407.00 | 30.00 | 60.10 | 2.00 |
| (0.197, 0.235] | 1879 | 392.00 | 20.86 | 10.00 | 9.79 | 7514 | 2,799.00 | 40.00 | 69.89 | 1.75 |
| (0.166, 0.197] | 1878 | 302.00 | 16.08 | 10.00 | 7.54 | 9392 | 3,101.00 | 50.00 | 77.43 | 1.55 |
| (0.141, 0.166] | 1878 | 270.00 | 14.38 | 10.00 | 6.74 | 11270 | 3,371.00 | 60.00 | 84.17 | 1.40 |
| (0.119, 0.141] | 1879 | 217.00 | 11.55 | 10.00 | 5.42 | 13149 | 3,588.00 | 70.00 | 89.59 | 1.28 |
| (0.0982, 0.119] | 1878 | 179.00 | 9.53 | 10.00 | 4.47 | 15027 | 3,767.00 | 80.00 | 94.06 | 1.18 |
| (0.078, 0.0982] | 1878 | 144.00 | 7.67 | 10.00 | 3.60 | 16905 | 3,911.00 | 90.00 | 97.65 | 1.09 |
| (0.0194, 0.078] | 1879 | 94.00 | 5.00 | 10.00 | 2.35 | 18784 | 4,005.00 | 100.00 | 100.00 | 1.00 |
Feature Importances
# Top-20 most important features according to the trained model
importances = pd.Series(gb.feature_importances_, index=X_train.columns)
gb_feature_importances = importances.sort_values(ascending=False)[:20]
gb_feature_importances
TAXI_OUT 0.43 WEEK 0.16 CRS_DEP_TIME 0.10 CRS_ARR_TIME 0.08 WEEKDAY 0.06 OP_CARRIER_AIRLINE_ID_20409.0 0.04 CRS_ELAPSED_TIME 0.03 YEAR 0.02 MONTH 0.01 WEEKEND 0.01 ORIGIN_STATE_NM_Florida 0.01 OP_CARRIER_AIRLINE_ID_20363.0 0.01 OP_CARRIER_AIRLINE_ID_19790.0 0.01 TAIL_NUM_COUNT 0.00 ORIGIN_STATE_NM_Virginia 0.00 ORIGIN_DCA 0.00 ORIGIN_CLT 0.00 ORIGIN_CITY_NAME_Charlotte, NC 0.00 ORIGIN_STATE_NM_Texas 0.00 ORIGIN_CITY_NAME_San Jose, CA 0.00 dtype: float64
# Bar chart of the top feature importances
plt.figure(figsize=(15, 5))
gb_feature_importances.plot(
    kind='bar',
    ylabel='Coeficientes de ponderación',
)
plt.title('Feature Importances');
Comparamos los tres modelos vistos:
# Shared hyperparameters for the three-model comparison
RANDOM_STATE = 42
n_estimators = 50
max_depth = 5

# (name, estimator) pairs for the models under comparison
models = [
    ('DecisionTree',
     DecisionTreeClassifier(max_depth=max_depth, random_state=RANDOM_STATE)),
    ('RandomForest',
     RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                            random_state=RANDOM_STATE)),
    ('GradientBoosting',
     GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                random_state=RANDOM_STATE)),
]
for model in models:
    print(model)
('DecisionTree', DecisionTreeClassifier(max_depth=5, random_state=42))
('RandomForest', RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42))
('GradientBoosting', GradientBoostingClassifier(max_depth=5, n_estimators=50, random_state=42))
# Train each candidate model and compare them via AUC and the ROC curve
# on the held-out test set.
plt.clf()  # start from an empty figure
for model_name, model_instance in models:
    # Fit on the training data (y flattened to 1-D, as sklearn expects)
    model_instance.fit(X_train, np.ravel(y_train))
    # Probability of the positive class for every test observation
    predictions = model_instance.predict_proba(X_test)[:, 1]
    auc_score = metrics.roc_auc_score(y_test, predictions)
    print('ROC AUC Score for {}: {}'.format(model_name, auc_score))
    # False/True positive rates per threshold -> the ROC curve's axes
    fpr, tpr, threshold = metrics.roc_curve(y_test, predictions)
    # FIX: the label used '{:2f}' (min width 2, default 6 decimals);
    # '{:.2f}' gives the intended 2-decimal area.
    plt.plot(fpr, tpr,
             label='ROC Curve for {} - Area: {:.2f}'.format(model_name, auc_score))
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # random-guess baseline
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend(loc="lower right")
plt.title('ROC curve')
plt.show()
ROC AUC Score for DecisionTree: 0.6868521223090529 ROC AUC Score for RandomForest: 0.699962485150548 ROC AUC Score for GradientBoosting: 0.7428600692736489
Podemos apreciar que el modelo que peor rendimiento tiene es el DecisionTree y el que mejor rendimiento tiene es el GradientBoosting
Cada vez que se hace un K-Fold, se va a obtener un resultado diferente --> Lo que se hace es repetirlo "n" veces
# Repeated K-Fold: 10-fold cross-validation repeated 15 times (150 fits total).
# NOTE: random_state=None, so the folds differ on every run of this cell.
cv = RepeatedKFold(n_splits=10, n_repeats=15, random_state=None)
Vamos a ver cómo podemos utilizar sklearn para probar diferentes hiperparámetros para un modelo dentro de un K-Fold Cross Validation. Lo vamos a hacer para un DecisionTreeClassifier
Defino los hiperparámetros
# Hyperparameter grid to explore for the DecisionTreeClassifier
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": np.arange(2, 7), # [2, 3, 4, 5, 6]
    "min_samples_split": [3, 5, 10, 15, 20]
}
1.- RandomizedSearchCV
explora sólo algunas combinaciones de nuestro param_grid
RandomizedSearchCV??
# RandomizedSearchCV: tries only a random sample of the param_grid combinations
st = time.time()                              # search start time
rand_cv = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(),       # model to tune
    param_distributions=param_grid,           # hyperparameter space to sample from
    cv=cv,                                    # repeated k-fold strategy defined above
    return_train_score=True,                  # also record train scores (over/under-fitting check)
    verbose=False,
    n_jobs=-1,                                # use every available processor
)
# Run the search on the training set
rand_cv.fit(X_train, y_train)
# Score of the best estimator found, measured on the training set
score_rand = rand_cv.score(X_train, y_train)
et = time.time()                              # search end time
tt_rand = round((et - st) / 60, 2)            # elapsed minutes
print("Total time took: ", str(tt_rand), " minutes!")
print(f'El score del mejor estimador es: {round(score_rand,2)}')
Total time took: 8.3 minutes! El score del mejor estimador es: 0.8
# Inspect the full search results: fit times, sampled params, per-split scores and ranks
rand_cv.cv_results_
{'mean_fit_time': array([1.88542116, 1.4212946 , 1.81299684, 2.95811394, 2.69706007,
2.98572977, 2.39465582, 2.61451053, 1.55545994, 1.55276469]),
'std_fit_time': array([0.7189075 , 0.15437755, 0.17377677, 0.24360429, 0.29345608,
0.31703923, 0.30433338, 0.26943386, 0.19518833, 0.19599377]),
'mean_score_time': array([0.07339485, 0.04811997, 0.04679912, 0.04628102, 0.04775553,
0.04662682, 0.05052947, 0.04737527, 0.05145327, 0.05186618]),
'std_score_time': array([0.11496697, 0.01088803, 0.00914093, 0.01118377, 0.01182615,
0.01062603, 0.01452289, 0.01048981, 0.01271328, 0.01349413]),
'param_min_samples_split': masked_array(data=[15, 20, 10, 20, 20, 15, 10, 10, 3, 10],
mask=[False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object),
'param_max_depth': masked_array(data=[2, 2, 3, 6, 5, 6, 4, 5, 2, 2],
mask=[False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object),
'param_criterion': masked_array(data=['entropy', 'entropy', 'entropy', 'entropy', 'gini',
'entropy', 'gini', 'gini', 'entropy', 'gini'],
mask=[False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object),
'params': [{'min_samples_split': 15, 'max_depth': 2, 'criterion': 'entropy'},
{'min_samples_split': 20, 'max_depth': 2, 'criterion': 'entropy'},
{'min_samples_split': 10, 'max_depth': 3, 'criterion': 'entropy'},
{'min_samples_split': 20, 'max_depth': 6, 'criterion': 'entropy'},
{'min_samples_split': 20, 'max_depth': 5, 'criterion': 'gini'},
{'min_samples_split': 15, 'max_depth': 6, 'criterion': 'entropy'},
{'min_samples_split': 10, 'max_depth': 4, 'criterion': 'gini'},
{'min_samples_split': 10, 'max_depth': 5, 'criterion': 'gini'},
{'min_samples_split': 3, 'max_depth': 2, 'criterion': 'entropy'},
{'min_samples_split': 10, 'max_depth': 2, 'criterion': 'gini'}],
'split0_test_score': array([0.79232225, 0.79232225, 0.79545723, 0.79888797, 0.79924287,
0.79888797, 0.79853307, 0.79924287, 0.79232225, 0.79427422]),
'split1_test_score': array([0.79244055, 0.79244055, 0.79468828, 0.79711345, 0.79829646,
0.79711345, 0.79835561, 0.79829646, 0.79244055, 0.79386017]),
'split2_test_score': array([0.80030758, 0.80030758, 0.80030758, 0.8066367 , 0.80645925,
0.8066367 , 0.80628179, 0.80645925, 0.80030758, 0.80030758]),
'split3_test_score': array([0.79752751, 0.79752751, 0.79794156, 0.80154975, 0.80018928,
0.80154975, 0.79995268, 0.80018928, 0.79752751, 0.79817816]),
'split4_test_score': array([0.79533893, 0.79533893, 0.79817816, 0.80344256, 0.80154975,
0.80344256, 0.8017272 , 0.80154975, 0.79533893, 0.79622619]),
'split5_test_score': array([0.79788241, 0.79788241, 0.79906542, 0.80362002, 0.80095824,
0.80362002, 0.80137229, 0.80095824, 0.79788241, 0.79788241]),
'split6_test_score': array([0.79882882, 0.79882882, 0.80279191, 0.80598604, 0.80563114,
0.80598604, 0.80545369, 0.80563114, 0.79882882, 0.80066249]),
'split7_test_score': array([0.79053534, 0.79053534, 0.79479444, 0.79787045, 0.79792961,
0.79787045, 0.79574091, 0.79792961, 0.79053534, 0.79053534]),
'split8_test_score': array([0.800769 , 0.800769 , 0.800769 , 0.80520556, 0.80638864,
0.80520556, 0.80396332, 0.80638864, 0.800769 , 0.80035492]),
'split9_test_score': array([0.79751553, 0.79751553, 0.80177462, 0.8050281 , 0.80425909,
0.8050281 , 0.80419994, 0.80425909, 0.79751553, 0.79976338]),
'split10_test_score': array([0.79320951, 0.79320951, 0.7966994 , 0.80007098, 0.79859222,
0.80007098, 0.79829646, 0.79859222, 0.79320951, 0.79468828]),
'split11_test_score': array([0.79161245, 0.79161245, 0.79699515, 0.79971608, 0.79936117,
0.79971608, 0.7966994 , 0.79936117, 0.79161245, 0.79386017]),
'split12_test_score': array([0.79587129, 0.79587129, 0.79817816, 0.79977523, 0.80225955,
0.79977523, 0.80036673, 0.80225955, 0.79587129, 0.79587129]),
'split13_test_score': array([0.79516148, 0.79516148, 0.80048503, 0.80131314, 0.80024843,
0.80131314, 0.80078079, 0.80024843, 0.79516148, 0.79527978]),
'split14_test_score': array([0.79924287, 0.79924287, 0.80131314, 0.80533538, 0.80415237,
0.80533538, 0.80415237, 0.80415237, 0.79924287, 0.79924287]),
'split15_test_score': array([0.79208565, 0.79208565, 0.79569384, 0.79711345, 0.79805986,
0.79711345, 0.79764581, 0.79805986, 0.79208565, 0.79403762]),
'split16_test_score': array([0.79752751, 0.79752751, 0.79859222, 0.80013013, 0.80137229,
0.80013013, 0.80078079, 0.80137229, 0.79752751, 0.7970543 ]),
'split17_test_score': array([0.79893523, 0.79893523, 0.79893523, 0.8043774 , 0.80538302,
0.8043774 , 0.80319432, 0.80538302, 0.79893523, 0.79893523]),
'split18_test_score': array([0.79810707, 0.79810707, 0.80183378, 0.80508725, 0.80319432,
0.80508725, 0.80266193, 0.80319432, 0.79810707, 0.79982254]),
'split19_test_score': array([0.79893523, 0.79893523, 0.80047323, 0.8029577 , 0.80301686,
0.8029577 , 0.80266193, 0.80301686, 0.79893523, 0.79893523]),
'split20_test_score': array([0.79782326, 0.79782326, 0.79900627, 0.80285106, 0.80083994,
0.80285106, 0.80137229, 0.80083994, 0.79782326, 0.79906542]),
'split21_test_score': array([0.79380102, 0.79380102, 0.79723175, 0.80013013, 0.79900627,
0.80013013, 0.79865137, 0.79900627, 0.79380102, 0.79480658]),
'split22_test_score': array([0.79989353, 0.79989353, 0.80237785, 0.80616349, 0.80592689,
0.80616349, 0.80622264, 0.80592689, 0.79989353, 0.80089909]),
'split23_test_score': array([0.80119484, 0.80119484, 0.80225955, 0.8065184 , 0.80415237,
0.8065184 , 0.80492133, 0.80415237, 0.80119484, 0.79983438]),
'split24_test_score': array([0.79054773, 0.79054773, 0.7965811 , 0.79841476, 0.79853307,
0.79841476, 0.79646279, 0.79853307, 0.79054773, 0.79232225]),
'split25_test_score': array([0.79995268, 0.79995268, 0.80681415, 0.80870697, 0.80835206,
0.80870697, 0.80450728, 0.80835206, 0.79995268, 0.8018455 ]),
'split26_test_score': array([0.79835561, 0.79835561, 0.80001183, 0.80308766, 0.80367917,
0.80308766, 0.8020821 , 0.80367917, 0.79835561, 0.79947947]),
'split27_test_score': array([0.78769595, 0.78769595, 0.79313812, 0.79650991, 0.79544513,
0.79650991, 0.79467613, 0.79544513, 0.78769595, 0.78935226]),
'split28_test_score': array([0.79597752, 0.79597752, 0.79597752, 0.80195209, 0.80041408,
0.80195209, 0.80011831, 0.80041408, 0.79597752, 0.79597752]),
'split29_test_score': array([0.79443951, 0.79443951, 0.79615498, 0.799231 , 0.799231 ,
0.799231 , 0.79952677, 0.799231 , 0.79443951, 0.7963916 ]),
'split30_test_score': array([0.79486573, 0.79486573, 0.79770496, 0.79977523, 0.79953863,
0.79977523, 0.79912457, 0.79953863, 0.79486573, 0.79587129]),
'split31_test_score': array([0.79486573, 0.79486573, 0.79675855, 0.80113569, 0.80042588,
0.80113569, 0.79959778, 0.80042588, 0.79486573, 0.79622619]),
'split32_test_score': array([0.80048503, 0.80048503, 0.80468473, 0.80746481, 0.80616349,
0.80746481, 0.80616349, 0.80616349, 0.80048503, 0.80202295]),
'split33_test_score': array([0.79309121, 0.79309121, 0.79829646, 0.80007098, 0.80060334,
0.80007098, 0.79983438, 0.80060334, 0.79309121, 0.79533893]),
'split34_test_score': array([0.80030758, 0.80030758, 0.80432982, 0.80693245, 0.80770141,
0.80693245, 0.80740565, 0.80770141, 0.80030758, 0.80190465]),
'split35_test_score': array([0.79510233, 0.79510233, 0.79456998, 0.79977523, 0.79805986,
0.79977523, 0.79847391, 0.79805986, 0.79510233, 0.7921448 ]),
'split36_test_score': array([0.79752751, 0.79752751, 0.79983438, 0.80350172, 0.80308766,
0.80350172, 0.80125399, 0.80308766, 0.79752751, 0.79989353]),
'split37_test_score': array([0.79396628, 0.79396628, 0.79763384, 0.800769 , 0.79887607,
0.800769 , 0.79976338, 0.79887607, 0.79396628, 0.79485359]),
'split38_test_score': array([0.79727891, 0.79727891, 0.80047323, 0.80242532, 0.8028394 ,
0.80242532, 0.80165631, 0.8028394 , 0.79727891, 0.79775214]),
'split39_test_score': array([0.79242828, 0.79242828, 0.7970423 , 0.80195209, 0.80059154,
0.80195209, 0.79863946, 0.80053239, 0.79242828, 0.79503106]),
'split40_test_score': array([0.79391932, 0.79391932, 0.79403762, 0.79912457, 0.79776411,
0.79912457, 0.796936 , 0.79776411, 0.79391932, 0.79474743]),
'split41_test_score': array([0.79841476, 0.79841476, 0.80119484, 0.80350172, 0.80344256,
0.80350172, 0.802437 , 0.80344256, 0.79841476, 0.79918372]),
'split42_test_score': array([0.7971726 , 0.7971726 , 0.8014906 , 0.80178635, 0.80385662,
0.80178635, 0.80285106, 0.80385662, 0.7971726 , 0.79906542]),
'split43_test_score': array([0.79533893, 0.79533893, 0.79841476, 0.80154975, 0.80048503,
0.80154975, 0.79829646, 0.80048503, 0.79533893, 0.79533893]),
'split44_test_score': array([0.79078434, 0.79078434, 0.79598959, 0.79817816, 0.79894712,
0.79817816, 0.79776411, 0.79894712, 0.79078434, 0.79143499]),
'split45_test_score': array([0.79989353, 0.79989353, 0.79989353, 0.80533538, 0.80515793,
0.80533538, 0.80409322, 0.80515793, 0.79989353, 0.79989353]),
'split46_test_score': array([0.79598959, 0.79598959, 0.80178635, 0.79995268, 0.80225955,
0.79995268, 0.8023187 , 0.80225955, 0.79598959, 0.80007098]),
'split47_test_score': array([0.79367051, 0.79367051, 0.79710145, 0.80053239, 0.80053239,
0.80053239, 0.79970423, 0.80053239, 0.79367051, 0.79479444]),
'split48_test_score': array([0.79733807, 0.79733807, 0.79822538, 0.80278024, 0.80183378,
0.80278024, 0.8021887 , 0.80183378, 0.79733807, 0.79674652]),
'split49_test_score': array([0.79875776, 0.79875776, 0.80100562, 0.80236616, 0.80236616,
0.80236616, 0.80189293, 0.80236616, 0.79875776, 0.79875776]),
'split50_test_score': array([0.79102094, 0.79102094, 0.79391932, 0.79634449, 0.79628534,
0.79634449, 0.79598959, 0.79628534, 0.79102094, 0.7920265 ]),
'split51_test_score': array([0.79865137, 0.79865137, 0.8018455 , 0.80450728, 0.80391577,
0.80450728, 0.80385662, 0.80391577, 0.79865137, 0.79989353]),
'split52_test_score': array([0.79338696, 0.79338696, 0.79829646, 0.80089909, 0.79936117,
0.80089909, 0.79746835, 0.79936117, 0.79338696, 0.79527978]),
'split53_test_score': array([0.80249616, 0.80249616, 0.80249616, 0.80805631, 0.8072282 ,
0.80805631, 0.8069916 , 0.8072282 , 0.80249616, 0.80249616]),
'split54_test_score': array([0.79551638, 0.79551638, 0.79882882, 0.80308766, 0.80101739,
0.80308766, 0.80107654, 0.80101739, 0.79551638, 0.79723175]),
'split55_test_score': array([0.79516148, 0.79516148, 0.79805986, 0.80089909, 0.80030758,
0.80089909, 0.80066249, 0.80030758, 0.79516148, 0.79516148]),
'split56_test_score': array([0.79533893, 0.79533893, 0.79510233, 0.79995268, 0.7970543 ,
0.79995268, 0.79758666, 0.7970543 , 0.79533893, 0.79533893]),
'split57_test_score': array([0.79662822, 0.79662822, 0.79662822, 0.79970423, 0.80017746,
0.79970423, 0.79917184, 0.80017746, 0.79662822, 0.79514936]),
'split58_test_score': array([0.79822538, 0.79822538, 0.80189293, 0.80461402, 0.80508725,
0.80461402, 0.80461402, 0.80508725, 0.79822538, 0.8006507 ]),
'split59_test_score': array([0.79503106, 0.79503106, 0.79994085, 0.8029577 , 0.79982254,
0.8029577 , 0.800769 , 0.79982254, 0.79503106, 0.79603668]),
'split60_test_score': array([0.80072164, 0.80072164, 0.80574944, 0.80923932, 0.80852952,
0.80923932, 0.80793801, 0.80852952, 0.80072164, 0.80444812]),
'split61_test_score': array([0.79391932, 0.79391932, 0.79835561, 0.80143144, 0.80036673,
0.80143144, 0.80030758, 0.80036673, 0.79391932, 0.79445167]),
'split62_test_score': array([0.79557554, 0.79557554, 0.79942032, 0.8023187 , 0.8018455 ,
0.8023187 , 0.80154975, 0.8018455 , 0.79557554, 0.7974092 ]),
'split63_test_score': array([0.80202295, 0.80202295, 0.80202295, 0.8069916 , 0.80527623,
0.8069916 , 0.80415237, 0.80527623, 0.80202295, 0.80202295]),
'split64_test_score': array([0.79356442, 0.79356442, 0.79610789, 0.79947947, 0.79912457,
0.79947947, 0.79794156, 0.79912457, 0.79356442, 0.79456998]),
'split65_test_score': array([0.79362357, 0.79362357, 0.7970543 , 0.80078079, 0.80113569,
0.80078079, 0.79959778, 0.80113569, 0.79362357, 0.79575299]),
'split66_test_score': array([0.79456998, 0.79456998, 0.79516148, 0.79906542, 0.79900627,
0.79906542, 0.79847391, 0.79900627, 0.79456998, 0.79628534]),
'split67_test_score': array([0.80053239, 0.80053239, 0.80325348, 0.80561964, 0.80508725,
0.80561964, 0.80508725, 0.80508725, 0.80053239, 0.80106477]),
'split68_test_score': array([0.78941142, 0.78941142, 0.79390713, 0.79532683, 0.79455782,
0.79532683, 0.79290151, 0.79455782, 0.78941142, 0.79094942]),
'split69_test_score': array([0.79698314, 0.79698314, 0.79964508, 0.80272109, 0.80337178,
0.80272109, 0.80260278, 0.80337178, 0.79698314, 0.79775214]),
'split70_test_score': array([0.79977523, 0.79977523, 0.80403407, 0.80598604, 0.80716905,
0.80598604, 0.80622264, 0.80716905, 0.79977523, 0.8022004 ]),
'split71_test_score': array([0.79664025, 0.79664025, 0.79664025, 0.79918372, 0.80036673,
0.79918372, 0.80030758, 0.80036673, 0.79664025, 0.79557554]),
'split72_test_score': array([0.79273631, 0.79273631, 0.79397847, 0.79906542, 0.7968177 ,
0.79906542, 0.79687685, 0.7968177 , 0.79273631, 0.79439252]),
'split73_test_score': array([0.79480658, 0.79480658, 0.79959778, 0.80350172, 0.8017272 ,
0.80350172, 0.80131314, 0.8017272 , 0.79480658, 0.79634449]),
'split74_test_score': array([0.79261801, 0.79261801, 0.796936 , 0.79581214, 0.79865137,
0.79581214, 0.79841476, 0.79865137, 0.79261801, 0.79368272]),
'split75_test_score': array([0.79403762, 0.79403762, 0.79735005, 0.8014906 , 0.80178635,
0.8014906 , 0.80018928, 0.80178635, 0.79403762, 0.79557554]),
'split76_test_score': array([0.79510233, 0.79510233, 0.79912457, 0.80166805, 0.80143144,
0.80166805, 0.80060334, 0.80143144, 0.79510233, 0.79587129]),
'split77_test_score': array([0.79917184, 0.79917184, 0.80130139, 0.80319432, 0.80354925,
0.80319432, 0.80467317, 0.80354925, 0.79917184, 0.80106477]),
'split78_test_score': array([0.80230701, 0.80230701, 0.80573795, 0.80899142, 0.80899142,
0.80905058, 0.80692103, 0.80899142, 0.80230701, 0.80307601]),
'split79_test_score': array([0.79361136, 0.79361136, 0.79509021, 0.79881692, 0.79929015,
0.79881692, 0.7985803 , 0.79929015, 0.79361136, 0.79556344]),
'split80_test_score': array([0.79593044, 0.79593044, 0.79616704, 0.80202295, 0.79977523,
0.80202295, 0.79971608, 0.79977523, 0.79593044, 0.79752751]),
'split81_test_score': array([0.79451082, 0.79451082, 0.79699515, 0.80007098, 0.80013013,
0.80007098, 0.80018928, 0.80013013, 0.79451082, 0.79486573]),
'split82_test_score': array([0.79504318, 0.79504318, 0.79989353, 0.80190465, 0.8016089 ,
0.80190465, 0.80066249, 0.8016089 , 0.79504318, 0.79687685]),
'split83_test_score': array([0.79433337, 0.79433337, 0.79433337, 0.79971608, 0.79865137,
0.79971608, 0.79995268, 0.79865137, 0.79433337, 0.79433337]),
'split84_test_score': array([0.79758666, 0.79758666, 0.79805986, 0.80279191, 0.8020821 ,
0.80279191, 0.80113569, 0.8020821 , 0.79758666, 0.79859222]),
'split85_test_score': array([0.7971726 , 0.7971726 , 0.79947947, 0.80267361, 0.80249616,
0.80267361, 0.80013013, 0.80249616, 0.7971726 , 0.79811901]),
'split86_test_score': array([0.79587129, 0.79587129, 0.80078079, 0.8023187 , 0.80119484,
0.8023187 , 0.80101739, 0.80119484, 0.79587129, 0.7972909 ]),
'split87_test_score': array([0.79893523, 0.79893523, 0.80461402, 0.80792665, 0.80680272,
0.80792665, 0.80561964, 0.80680272, 0.79893523, 0.80153801]),
'split88_test_score': array([0.79272405, 0.79272405, 0.79645075, 0.799231 , 0.79905353,
0.799231 , 0.79840284, 0.79905353, 0.79272405, 0.79491275]),
'split89_test_score': array([0.79798876, 0.79798876, 0.8006507 , 0.80313517, 0.80230701,
0.80313517, 0.80272109, 0.80230701, 0.79798876, 0.79816622]),
'split90_test_score': array([0.79297291, 0.79297291, 0.79593044, 0.79865137, 0.79770496,
0.79865137, 0.79776411, 0.79770496, 0.79297291, 0.79456998]),
'split91_test_score': array([0.79285461, 0.79285461, 0.79569384, 0.79735005, 0.79776411,
0.79735005, 0.79800071, 0.79776411, 0.79285461, 0.79468828]),
'split92_test_score': array([0.79380102, 0.79380102, 0.79811901, 0.80066249, 0.79995268,
0.80066249, 0.80007098, 0.79995268, 0.79380102, 0.79380102]),
'split93_test_score': array([0.79332781, 0.79332781, 0.79847391, 0.80007098, 0.80089909,
0.80007098, 0.80119484, 0.80089909, 0.79332781, 0.79593044]),
'split94_test_score': array([0.79776411, 0.79776411, 0.79906542, 0.80273276, 0.80261446,
0.80273276, 0.80166805, 0.80261446, 0.79776411, 0.79847391]),
'split95_test_score': array([0.7970543 , 0.7970543 , 0.79646279, 0.80137229, 0.8014906 ,
0.80137229, 0.80060334, 0.8014906 , 0.7970543 , 0.79800071]),
'split96_test_score': array([0.8022004 , 0.8022004 , 0.8022004 , 0.80657755, 0.8068733 ,
0.80657755, 0.80569029, 0.8068733 , 0.8022004 , 0.8022004 ]),
'split97_test_score': array([0.79585921, 0.79585921, 0.80088731, 0.80337178, 0.80189293,
0.80337178, 0.80183378, 0.80189293, 0.79585921, 0.79733807]),
'split98_test_score': array([0.79739722, 0.79739722, 0.80023662, 0.80278024, 0.80183378,
0.80278024, 0.80183378, 0.80183378, 0.79739722, 0.79727891]),
'split99_test_score': array([0.79822538, 0.79822538, 0.8021887 , 0.8043774 , 0.80496894,
0.8043774 , 0.80112393, 0.80496894, 0.79822538, 0.79905353]),
'split100_test_score': array([0.78889152, 0.78889152, 0.79297291, 0.79776411, 0.79575299,
0.79776411, 0.79196735, 0.79575299, 0.78889152, 0.79031113]),
'split101_test_score': array([0.79415592, 0.79415592, 0.79894712, 0.80054419, 0.80125399,
0.80054419, 0.79977523, 0.80125399, 0.79415592, 0.79539808]),
'split102_test_score': array([0.79871052, 0.79871052, 0.79871052, 0.80308766, 0.80202295,
0.80308766, 0.8022004 , 0.80202295, 0.79871052, 0.79871052]),
'split103_test_score': array([0.79279546, 0.79279546, 0.79622619, 0.79634449, 0.79758666,
0.79634449, 0.7971726 , 0.79758666, 0.79279546, 0.79273631]),
'split104_test_score': array([0.79989353, 0.79989353, 0.80344256, 0.80669585, 0.80752396,
0.80669585, 0.80616349, 0.80752396, 0.79989353, 0.80154975]),
'split105_test_score': array([0.79735005, 0.79735005, 0.79794156, 0.80137229, 0.80078079,
0.80137229, 0.80083994, 0.80078079, 0.79735005, 0.79829646]),
'split106_test_score': array([0.79616704, 0.79616704, 0.79865137, 0.79989353, 0.80137229,
0.79989353, 0.80202295, 0.80137229, 0.79616704, 0.79687685]),
'split107_test_score': array([0.79769299, 0.79769299, 0.79828453, 0.80372671, 0.80118308,
0.80372671, 0.80124224, 0.80118308, 0.79769299, 0.79863946]),
'split108_test_score': array([0.79899438, 0.79899438, 0.80396332, 0.80331263, 0.80349009,
0.80331263, 0.80260278, 0.80349009, 0.79899438, 0.79893523]),
'split109_test_score': array([0.7993493 , 0.7993493 , 0.7993493 , 0.80467317, 0.80443656,
0.80467317, 0.80372671, 0.80443656, 0.7993493 , 0.79792961]),
'split110_test_score': array([0.79421507, 0.79421507, 0.79800071, 0.80131314, 0.80119484,
0.80131314, 0.80042588, 0.80119484, 0.79421507, 0.79569384]),
'split111_test_score': array([0.79805986, 0.79805986, 0.80462558, 0.80557199, 0.80628179,
0.80557199, 0.80563114, 0.80628179, 0.79805986, 0.80291021]),
'split112_test_score': array([0.79959778, 0.79959778, 0.80237785, 0.80492133, 0.80563114,
0.80492133, 0.80438897, 0.80563114, 0.79959778, 0.80060334]),
'split113_test_score': array([0.79048858, 0.79048858, 0.79344611, 0.79646279, 0.79746835,
0.79646279, 0.79557554, 0.79746835, 0.79048858, 0.79161245]),
'split114_test_score': array([0.79326866, 0.79326866, 0.79504318, 0.79758666, 0.79794156,
0.79758666, 0.79581214, 0.79794156, 0.79326866, 0.79451082]),
'split115_test_score': array([0.7917899 , 0.7917899 , 0.79480658, 0.79445167, 0.79622619,
0.79445167, 0.79397847, 0.79622619, 0.7917899 , 0.79303206]),
'split116_test_score': array([0.80001183, 0.80001183, 0.80450728, 0.8072282 , 0.8072282 ,
0.8072282 , 0.8069916 , 0.8072282 , 0.80001183, 0.80101739]),
'split117_test_score': array([0.79556344, 0.79556344, 0.8 , 0.80343094, 0.80118308,
0.80343094, 0.80124224, 0.80118308, 0.79556344, 0.79633245]),
'split118_test_score': array([0.79834369, 0.79834369, 0.79982254, 0.80490979, 0.80183378,
0.80490979, 0.80165631, 0.80183378, 0.79834369, 0.79905353]),
'split119_test_score': array([0.79757468, 0.79757468, 0.79994085, 0.80124224, 0.80236616,
0.80124224, 0.80260278, 0.80236616, 0.79757468, 0.79905353]),
'split120_test_score': array([0.79557554, 0.79557554, 0.79800071, 0.80042588, 0.80154975,
0.80042588, 0.80030758, 0.80154975, 0.79557554, 0.79557554]),
'split121_test_score': array([0.80007098, 0.80007098, 0.80131314, 0.80622264, 0.80580859,
0.80622264, 0.80432982, 0.80580859, 0.80007098, 0.80255531]),
'split122_test_score': array([0.80119484, 0.80119484, 0.80367917, 0.80645925, 0.80598604,
0.80645925, 0.80545369, 0.80598604, 0.80119484, 0.80119484]),
'split123_test_score': array([0.79474743, 0.79474743, 0.79746835, 0.79900627, 0.79942032,
0.79900627, 0.79947947, 0.79942032, 0.79474743, 0.79563469]),
'split124_test_score': array([0.79557554, 0.79557554, 0.80060334, 0.80350172, 0.80356087,
0.80344256, 0.80249616, 0.80356087, 0.79557554, 0.79888797]),
'split125_test_score': array([0.78753105, 0.78753105, 0.79362357, 0.79474743, 0.79291376,
0.79474743, 0.79273631, 0.79291376, 0.78753105, 0.78954217]),
'split126_test_score': array([0.79776411, 0.79776411, 0.79865137, 0.80285106, 0.8019638 ,
0.80285106, 0.79894712, 0.8019638 , 0.79776411, 0.79900627]),
'split127_test_score': array([0.79585921, 0.79585921, 0.7985803 , 0.80337178, 0.80195209,
0.80337178, 0.8014197 , 0.80195209, 0.79585921, 0.79656906]),
'split128_test_score': array([0.79680568, 0.79680568, 0.79680568, 0.79964508, 0.79834369,
0.79964508, 0.79621414, 0.79834369, 0.79680568, 0.79686483]),
'split129_test_score': array([0.79674652, 0.79674652, 0.80053239, 0.80301686, 0.80260278,
0.80301686, 0.80242532, 0.80260278, 0.79674652, 0.79834369]),
'split130_test_score': array([0.79350526, 0.79350526, 0.79865137, 0.80030758, 0.79936117,
0.80030758, 0.79835561, 0.79936117, 0.79350526, 0.79581214]),
'split131_test_score': array([0.79090264, 0.79090264, 0.79800071, 0.80013013, 0.79894712,
0.80013013, 0.79811901, 0.79894712, 0.79090264, 0.79155329]),
'split132_test_score': array([0.79527978, 0.79527978, 0.79947947, 0.80137229, 0.79995268,
0.80137229, 0.80083994, 0.79995268, 0.79527978, 0.79616704]),
'split133_test_score': array([0.79817816, 0.79817816, 0.79817816, 0.79983438, 0.8019638 ,
0.79983438, 0.80190465, 0.8019638 , 0.79817816, 0.79817816]),
'split134_test_score': array([0.79486573, 0.79486573, 0.79557554, 0.80089909, 0.8014906 ,
0.80089909, 0.79942032, 0.8014906 , 0.79486573, 0.79699515]),
'split135_test_score': array([0.79800071, 0.79800071, 0.79758666, 0.80314681, 0.80101739,
0.80314681, 0.80107654, 0.80101739, 0.79800071, 0.79876967]),
'split136_test_score': array([0.79835561, 0.79835561, 0.79942032, 0.80468473, 0.80249616,
0.80468473, 0.80267361, 0.80249616, 0.79835561, 0.79900627]),
'split137_test_score': array([0.7970423 , 0.7970423 , 0.79739722, 0.80201124, 0.8006507 ,
0.80201124, 0.80011831, 0.8006507 , 0.7970423 , 0.79769299]),
'split138_test_score': array([0.8006507 , 0.8006507 , 0.8006507 , 0.80591541, 0.80461402,
0.80591541, 0.80449571, 0.80461402, 0.8006507 , 0.79929015]),
'split139_test_score': array([0.79727891, 0.79727891, 0.80431825, 0.80698018, 0.80538302,
0.80698018, 0.80597456, 0.80538302, 0.79727891, 0.79911269]),
'split140_test_score': array([0.79764581, 0.79764581, 0.80089909, 0.80154975, 0.80178635,
0.80154975, 0.80101739, 0.80178635, 0.79764581, 0.79557554]),
'split141_test_score': array([0.79800071, 0.79800071, 0.80101739, 0.80373832, 0.80427067,
0.80373832, 0.80427067, 0.80427067, 0.79800071, 0.79953863]),
'split142_test_score': array([0.79510233, 0.79510233, 0.79930202, 0.80279191, 0.80089909,
0.80279191, 0.80048503, 0.80089909, 0.79510233, 0.7972909 ]),
'split143_test_score': array([0.7968177 , 0.7968177 , 0.79888797, 0.8019638 , 0.80083994,
0.8019638 , 0.80083994, 0.80083994, 0.7968177 , 0.7968177 ]),
'split144_test_score': array([0.79776411, 0.79776411, 0.79947947, 0.79942032, 0.8016089 ,
0.79942032, 0.80007098, 0.8016089 , 0.79776411, 0.79640364]),
'split145_test_score': array([0.79001538, 0.79001538, 0.79273631, 0.79699515, 0.79835561,
0.79699515, 0.79474743, 0.79835561, 0.79001538, 0.79137584]),
'split146_test_score': array([0.79285461, 0.79285461, 0.79598959, 0.80018928, 0.79829646,
0.80018928, 0.79805986, 0.79829646, 0.79285461, 0.79350526]),
'split147_test_score': array([0.79260574, 0.79260574, 0.79609583, 0.79834369, 0.799231 ,
0.79834369, 0.79881692, 0.799231 , 0.79260574, 0.79349305]),
'split148_test_score': array([0.80343094, 0.80343094, 0.80810411, 0.81023366, 0.80993789,
0.81023366, 0.80905058, 0.80993789, 0.80343094, 0.80520556]),
'split149_test_score': array([0.79538598, 0.79538598, 0.80029577, 0.80236616, 0.80171547,
0.80236616, 0.80189293, 0.80171547, 0.79538598, 0.79739722]),
'mean_test_score': array([0.79612163, 0.79612163, 0.79894356, 0.80201196, 0.80163181,
0.80201196, 0.80097325, 0.80163141, 0.79612163, 0.79712169]),
'std_test_score': array([0.00313568, 0.00313568, 0.0030386 , 0.00315658, 0.00309564,
0.00315727, 0.00319488, 0.00309578, 0.00313568, 0.00309077]),
'rank_test_score': array([8, 8, 6, 2, 3, 1, 5, 4, 8, 7]),
'split0_train_score': array([0.79626818, 0.79626818, 0.79976471, 0.80358985, 0.80344526,
0.80358985, 0.80209792, 0.80344526, 0.79626818, 0.79795729]),
'split1_train_score': array([0.79625504, 0.79625504, 0.79985015, 0.80305092, 0.80337297,
0.80305092, 0.80230166, 0.80337297, 0.79625504, 0.7980033 ]),
'split2_train_score': array([0.79728691, 0.79728691, 0.79728691, 0.80311664, 0.80240025,
0.80311664, 0.80142096, 0.80240025, 0.79728691, 0.79728691]),
'split3_train_score': array([0.79568981, 0.79568981, 0.79931121, 0.80332039, 0.80313636,
0.80332039, 0.80167728, 0.80313636, 0.79568981, 0.79654422]),
'split4_train_score': array([0.79593299, 0.79593299, 0.79928492, 0.80337954, 0.80186788,
0.80337954, 0.80165099, 0.80186788, 0.79593299, 0.79676111]),
'split5_train_score': array([0.79755637, 0.79755637, 0.79951496, 0.80337954, 0.80211763,
0.80337954, 0.80177587, 0.80211763, 0.79755637, 0.79755637]),
'split6_train_score': array([0.79554521, 0.79554521, 0.79877227, 0.80311664, 0.80186131,
0.80311664, 0.8015064 , 0.80186131, 0.79554521, 0.79724747]),
'split7_train_score': array([0.7973934 , 0.7973934 , 0.79997634, 0.80364372, 0.80329539,
0.80364372, 0.8023161 , 0.80329539, 0.7973934 , 0.7973934 ]),
'split8_train_score': array([0.79723566, 0.79723566, 0.79723566, 0.80265787, 0.80287476,
0.80265787, 0.80152742, 0.80287476, 0.79723566, 0.79696619]),
'split9_train_score': array([0.79569115, 0.79569115, 0.79906278, 0.80314422, 0.80154056,
0.80314422, 0.80146827, 0.80154056, 0.79569115, 0.79734739]),
'split10_train_score': array([0.79616959, 0.79616959, 0.79944923, 0.80351756, 0.80309692,
0.80351756, 0.80197961, 0.80309692, 0.79616959, 0.79740521]),
'split11_train_score': array([0.79634705, 0.79634705, 0.79959382, 0.80266314, 0.80349127,
0.80266314, 0.80077686, 0.80349127, 0.79634705, 0.7980033 ]),
'split12_train_score': array([0.79587384, 0.79587384, 0.79946238, 0.80320208, 0.80344526,
0.80320208, 0.8014341 , 0.80344526, 0.79587384, 0.79831878]),
'split13_train_score': array([0.7959527 , 0.7959527 , 0.79920605, 0.80279459, 0.80177587,
0.80279459, 0.80175615, 0.80177587, 0.7959527 , 0.79753009]),
'split14_train_score': array([0.79642592, 0.79642592, 0.79925206, 0.80270258, 0.80178901,
0.80270258, 0.80138152, 0.80178901, 0.79642592, 0.79642592]),
'split15_train_score': array([0.79629447, 0.79629447, 0.79956096, 0.80370159, 0.80269601,
0.80370159, 0.80238053, 0.80269601, 0.79629447, 0.79798358]),
'split16_train_score': array([0.79568981, 0.79568981, 0.79941637, 0.80226223, 0.80189417,
0.80226223, 0.80148011, 0.80189417, 0.79568981, 0.79733291]),
'split17_train_score': array([0.7974394 , 0.7974394 , 0.7974394 , 0.80286818, 0.80270388,
0.80286818, 0.80158 , 0.80270388, 0.7974394 , 0.7974394 ]),
'split18_train_score': array([0.79562543, 0.79562543, 0.79887875, 0.80308507, 0.8016983 ,
0.80308507, 0.80124481, 0.8016983 , 0.79562543, 0.79702534]),
'split19_train_score': array([0.7974394 , 0.7974394 , 0.79914822, 0.80308507, 0.80313765,
0.80308507, 0.80156685, 0.80313765, 0.7974394 , 0.7974394 ]),
'split20_train_score': array([0.79565695, 0.79565695, 0.79937036, 0.80304434, 0.80247254,
0.80304434, 0.80195332, 0.80247254, 0.79565695, 0.79742493]),
'split21_train_score': array([0.79610387, 0.79610387, 0.79939008, 0.80328095, 0.80210449,
0.80328095, 0.80179558, 0.80210449, 0.79610387, 0.79691885]),
'split22_train_score': array([0.79542691, 0.79542691, 0.79881828, 0.80281431, 0.80261056,
0.80281431, 0.80115149, 0.80261056, 0.79542691, 0.79624189]),
'split23_train_score': array([0.79528232, 0.79528232, 0.79900888, 0.80237396, 0.80157869,
0.80237396, 0.80130265, 0.80157869, 0.79528232, 0.79702401]),
'split24_train_score': array([0.79646535, 0.79646535, 0.79963983, 0.80309035, 0.80349784,
0.80309035, 0.80080972, 0.80349784, 0.79646535, 0.79817418]),
'split25_train_score': array([0.79542034, 0.79542034, 0.79910089, 0.80283403, 0.80161156,
0.80283403, 0.7996464 , 0.80161156, 0.79542034, 0.79711602]),
'split26_train_score': array([0.79559779, 0.79559779, 0.79925863, 0.80378703, 0.8025777 ,
0.80378703, 0.80180873, 0.8025777 , 0.79559779, 0.79639963]),
'split27_train_score': array([0.79678217, 0.79678217, 0.79984489, 0.80427467, 0.80367001,
0.80427467, 0.80271045, 0.80367001, 0.79678217, 0.79850413]),
'split28_train_score': array([0.79776802, 0.79776802, 0.79776802, 0.80339397, 0.80225038,
0.80339397, 0.80191519, 0.80225038, 0.79776802, 0.79776802]),
'split29_train_score': array([0.79603291, 0.79603291, 0.79968715, 0.80328224, 0.80253299,
0.80328224, 0.80216494, 0.80253299, 0.79603291, 0.79772201]),
'split30_train_score': array([0.79691228, 0.79691228, 0.7994558 , 0.80364243, 0.80212421,
0.80364243, 0.80188103, 0.80212421, 0.79691228, 0.79777984]),
'split31_train_score': array([0.79598557, 0.79598557, 0.79944266, 0.80349784, 0.80213735,
0.80349784, 0.80177587, 0.80213735, 0.79598557, 0.7977404 ]),
'split32_train_score': array([0.79536119, 0.79536119, 0.79873941, 0.80270915, 0.80279459,
0.80270915, 0.80125007, 0.80279459, 0.79536119, 0.79709631]),
'split33_train_score': array([0.79618274, 0.79618274, 0.79944923, 0.80349784, 0.80305092,
0.80349784, 0.80213078, 0.80305092, 0.79618274, 0.79783899]),
'split34_train_score': array([0.7953809 , 0.7953809 , 0.79877884, 0.80309035, 0.80134209,
0.80309035, 0.80111205, 0.80134209, 0.7953809 , 0.79710945]),
'split35_train_score': array([0.79595928, 0.79595928, 0.79981729, 0.80360957, 0.80240025,
0.80360957, 0.80217678, 0.80240025, 0.79595928, 0.79787842]),
'split36_train_score': array([0.79568981, 0.79568981, 0.79927835, 0.80297862, 0.80307063,
0.80297862, 0.80034308, 0.80307063, 0.79568981, 0.79733291]),
'split37_train_score': array([0.7970122 , 0.7970122 , 0.79960829, 0.80334797, 0.80336768,
0.80334797, 0.8019612 , 0.80336768, 0.7970122 , 0.7978929 ]),
'split38_train_score': array([0.79664415, 0.79664415, 0.79934539, 0.80373574, 0.80309822,
0.80373574, 0.80192833, 0.80309822, 0.79664415, 0.79757085]),
'split39_train_score': array([0.79625638, 0.79625638, 0.79941112, 0.80334797, 0.80351228,
0.80334797, 0.80156028, 0.80351228, 0.79625638, 0.79841211]),
'split40_train_score': array([0.79609073, 0.79609073, 0.79974499, 0.80403021, 0.80340583,
0.80403021, 0.80209792, 0.80340583, 0.79609073, 0.79739864]),
'split41_train_score': array([0.79651793, 0.79651793, 0.7992652 , 0.80301148, 0.80305749,
0.80301148, 0.80184816, 0.80305749, 0.79651793, 0.79741178]),
'split42_train_score': array([0.79572924, 0.79572924, 0.79909432, 0.80167071, 0.80276173,
0.80167071, 0.80125007, 0.80276173, 0.79572924, 0.79710945]),
'split43_train_score': array([0.79783899, 0.79783899, 0.79958725, 0.80376074, 0.80233452,
0.80376074, 0.80042852, 0.80233452, 0.79783899, 0.79783899]),
'split44_train_score': array([0.79643906, 0.79643906, 0.79970556, 0.80283403, 0.80312979,
0.80283403, 0.80209792, 0.80312979, 0.79643906, 0.79729348]),
'split45_train_score': array([0.79635362, 0.79635362, 0.79635362, 0.80274859, 0.80236081,
0.80274859, 0.80138809, 0.80236081, 0.79635362, 0.79635362]),
'split46_train_score': array([0.79586069, 0.79586069, 0.798884 , 0.80318894, 0.80167071,
0.80318894, 0.80119749, 0.80167071, 0.79586069, 0.79785213]),
'split47_train_score': array([0.79611836, 0.79611836, 0.799582 , 0.80273016, 0.80298649,
0.80273016, 0.80188233, 0.80298649, 0.79611836, 0.79692019]),
'split48_train_score': array([0.79571087, 0.79571087, 0.79927967, 0.80346627, 0.8019612 ,
0.80346627, 0.80153399, 0.8019612 , 0.79571087, 0.79736711]),
'split49_train_score': array([0.79745912, 0.79745912, 0.79929938, 0.80368973, 0.80323624,
0.80368973, 0.80168516, 0.80323624, 0.79745912, 0.79745912]),
'split50_train_score': array([0.79641277, 0.79641277, 0.79993559, 0.80293919, 0.80258427,
0.80293919, 0.80228852, 0.80258427, 0.79641277, 0.79722775]),
'split51_train_score': array([0.79556493, 0.79556493, 0.79961354, 0.8034847 , 0.80167728,
0.8034847 , 0.80159841, 0.80167728, 0.79556493, 0.79733291]),
'split52_train_score': array([0.79614988, 0.79614988, 0.79940322, 0.80240025, 0.80230166,
0.80240025, 0.80052054, 0.80230166, 0.79614988, 0.79784556]),
'split53_train_score': array([0.79606444, 0.79606444, 0.79606444, 0.80236081, 0.80230166,
0.80236081, 0.80105947, 0.80230166, 0.79606444, 0.79606444]),
'split54_train_score': array([0.79591327, 0.79591327, 0.79934407, 0.8037936 , 0.80205848,
0.8037936 , 0.80170357, 0.80205848, 0.79591327, 0.79762867]),
'split55_train_score': array([0.79687942, 0.79687942, 0.79961354, 0.80320865, 0.8027683 ,
0.80320865, 0.80177587, 0.8027683 , 0.79687942, 0.79687942]),
'split56_train_score': array([0.79783899, 0.79783899, 0.79995531, 0.80381332, 0.80264343,
0.80381332, 0.8023871 , 0.80264343, 0.79783899, 0.79783899]),
'split57_train_score': array([0.79671644, 0.79671644, 0.79671644, 0.80332168, 0.80345313,
0.80332168, 0.80167201, 0.80345313, 0.79671644, 0.79754456]),
'split58_train_score': array([0.79561228, 0.79561228, 0.79904963, 0.80261186, 0.8017706 ,
0.80261186, 0.80141569, 0.8017706 , 0.79561228, 0.7972488 ]),
'split59_train_score': array([0.79596719, 0.79596719, 0.79926652, 0.80370288, 0.80234239,
0.80370288, 0.80192176, 0.80234239, 0.79596719, 0.79776145]),
'split60_train_score': array([0.7953349 , 0.7953349 , 0.79862111, 0.80253827, 0.80110548,
0.80253827, 0.80068485, 0.80110548, 0.7953349 , 0.79651136]),
'split61_train_score': array([0.79609073, 0.79609073, 0.79944266, 0.80287346, 0.80220965,
0.80287346, 0.80180873, 0.80220965, 0.79609073, 0.79695828]),
'split62_train_score': array([0.7959067 , 0.7959067 , 0.79932436, 0.80368187, 0.80318236,
0.80368187, 0.80180216, 0.80318236, 0.7959067 , 0.79760895]),
'split63_train_score': array([0.79709631, 0.79709631, 0.79709631, 0.80321523, 0.8023871 ,
0.80321523, 0.80002103, 0.8023871 , 0.79709631, 0.79709631]),
'split64_train_score': array([0.79613016, 0.79613016, 0.80037594, 0.80380017, 0.80320865,
0.80380017, 0.80234767, 0.80320865, 0.79613016, 0.79792443]),
'split65_train_score': array([0.79612359, 0.79612359, 0.7994098 , 0.80320208, 0.80324152,
0.80320208, 0.80197304, 0.80324152, 0.79612359, 0.79779298]),
'split66_train_score': array([0.79601843, 0.79601843, 0.79979757, 0.80323494, 0.80347812,
0.80323494, 0.8021965 , 0.80347812, 0.79601843, 0.79773383]),
'split67_train_score': array([0.79628266, 0.79628266, 0.79903649, 0.80355171, 0.80163258,
0.80355828, 0.80155371, 0.80163258, 0.79628266, 0.7972028 ]),
'split68_train_score': array([0.79659157, 0.79659157, 0.79993691, 0.80393948, 0.80306536,
0.80393948, 0.80105421, 0.80306536, 0.79659157, 0.79832667]),
'split69_train_score': array([0.7957503 , 0.7957503 , 0.79929938, 0.80261843, 0.80309822,
0.80261843, 0.80127767, 0.80309822, 0.7957503 , 0.79725538]),
'split70_train_score': array([0.79544006, 0.79544006, 0.79881171, 0.80261056, 0.80265657,
0.80261056, 0.80108576, 0.80265657, 0.79544006, 0.79676111]),
'split71_train_score': array([0.7976944 , 0.7976944 , 0.7976944 , 0.80381332, 0.80209134,
0.80381332, 0.80167071, 0.80209134, 0.7976944 , 0.79683341]),
'split72_train_score': array([0.79622217, 0.79622217, 0.79992902, 0.80328095, 0.80254484,
0.80328095, 0.80245283, 0.80254484, 0.79622217, 0.79794415]),
'split73_train_score': array([0.79599214, 0.79599214, 0.79930464, 0.8034847 , 0.80227537,
0.8034847 , 0.80197304, 0.80227537, 0.79599214, 0.79772726]),
'split74_train_score': array([0.79623532, 0.79623532, 0.79955439, 0.80298519, 0.80234767,
0.80298519, 0.80211106, 0.80234767, 0.79623532, 0.79802302]),
'split75_train_score': array([0.79607758, 0.79607758, 0.79950838, 0.80353728, 0.80291947,
0.80353728, 0.80209792, 0.80291947, 0.79607758, 0.7978127 ]),
'split76_train_score': array([0.79595928, 0.79595928, 0.79931121, 0.8035307 , 0.80203219,
0.8035307 , 0.80177587, 0.80203219, 0.79595928, 0.79746436]),
'split77_train_score': array([0.79550712, 0.79550712, 0.7989379 , 0.80332168, 0.80303249,
0.80332168, 0.8015077 , 0.80303249, 0.79550712, 0.7972028 ]),
'split78_train_score': array([0.79515879, 0.79515879, 0.79930596, 0.80221095, 0.8020795 ,
0.80221095, 0.80097534, 0.8020795 , 0.79515879, 0.79600005]),
'split79_train_score': array([0.79612493, 0.79612493, 0.79980546, 0.80299963, 0.80339397,
0.80299963, 0.80227667, 0.80339397, 0.79612493, 0.79781403]),
'split80_train_score': array([0.79586726, 0.79586726, 0.79968584, 0.80336639, 0.80225565,
0.80336639, 0.8021505 , 0.80225565, 0.79586726, 0.79759581]),
'split81_train_score': array([0.796025 , 0.796025 , 0.79941637, 0.80363586, 0.80216364,
0.80363586, 0.80175615, 0.80216364, 0.796025 , 0.79757609]),
'split82_train_score': array([0.79596585, 0.79596585, 0.79927178, 0.80293919, 0.80318236,
0.80293919, 0.80061912, 0.80318236, 0.79596585, 0.79766811]),
'split83_train_score': array([0.79795072, 0.79795072, 0.79795072, 0.80357014, 0.80326123,
0.80357014, 0.80194018, 0.80326123, 0.79795072, 0.79795072]),
'split84_train_score': array([0.79568324, 0.79568324, 0.79947552, 0.80325466, 0.80313636,
0.80325466, 0.80199276, 0.80313636, 0.79568324, 0.79747751]),
'split85_train_score': array([0.79572924, 0.79572924, 0.79931778, 0.80278145, 0.80278802,
0.80278145, 0.80005389, 0.80278802, 0.79572924, 0.79655079]),
'split86_train_score': array([0.79587384, 0.79587384, 0.79899573, 0.8035307 , 0.80199933,
0.8035307 , 0.80178244, 0.80199933, 0.79587384, 0.7976221 ]),
'split87_train_score': array([0.79553341, 0.79553341, 0.79874731, 0.80263158, 0.80261843,
0.80263158, 0.80121852, 0.80261843, 0.79553341, 0.79683474]),
'split88_train_score': array([0.79622351, 0.79622351, 0.79947684, 0.80361744, 0.80362401,
0.80361744, 0.80228982, 0.80362401, 0.79622351, 0.79788632]),
'split89_train_score': array([0.79563857, 0.79563857, 0.7990102 , 0.80242784, 0.80194148,
0.80242784, 0.80147484, 0.80194148, 0.79563857, 0.79720937]),
'split90_train_score': array([0.79619588, 0.79619588, 0.79953467, 0.80391848, 0.80247912,
0.80391848, 0.80236081, 0.80247912, 0.79619588, 0.79792443]),
'split91_train_score': array([0.79620903, 0.79620903, 0.79973842, 0.80305092, 0.80270915,
0.80305092, 0.8021505 , 0.80270915, 0.79620903, 0.79791129]),
'split92_train_score': array([0.79703058, 0.79703058, 0.79960697, 0.80358328, 0.80215707,
0.80358328, 0.80182845, 0.80215707, 0.79703058, 0.79703058]),
'split93_train_score': array([0.79615645, 0.79615645, 0.79925206, 0.80370816, 0.80326123,
0.80370816, 0.80138152, 0.80326123, 0.79615645, 0.79745779]),
'split94_train_score': array([0.79566352, 0.79566352, 0.79936379, 0.80266972, 0.80283403,
0.80266972, 0.80165756, 0.80283403, 0.79566352, 0.79651136]),
'split95_train_score': array([0.79574239, 0.79574239, 0.79965298, 0.80333353, 0.80326123,
0.80333353, 0.80203219, 0.80326123, 0.79574239, 0.79795729]),
'split96_train_score': array([0.79707659, 0.79707659, 0.79707659, 0.80263685, 0.80237396,
0.80263685, 0.80113834, 0.80237396, 0.79707659, 0.79707659]),
'split97_train_score': array([0.79587518, 0.79587518, 0.79916136, 0.80335454, 0.80199406,
0.80335454, 0.80191519, 0.80199406, 0.79587518, 0.79761686]),
'split98_train_score': array([0.7957043 , 0.7957043 , 0.79905621, 0.80327567, 0.80182975,
0.80327567, 0.80163915, 0.80182975, 0.7957043 , 0.79730796]),
'split99_train_score': array([0.79653899, 0.79653899, 0.79895762, 0.80290105, 0.80256586,
0.80290105, 0.80011436, 0.80256586, 0.79653899, 0.79742626]),
'split100_train_score': array([0.79664938, 0.79664938, 0.80072428, 0.80437854, 0.80387247,
0.80437854, 0.80140781, 0.80387247, 0.79664938, 0.79839764]),
'split101_train_score': array([0.79606444, 0.79606444, 0.79937693, 0.8025777 , 0.80305749,
0.8025777 , 0.80148668, 0.80305749, 0.79606444, 0.79614331]),
'split102_train_score': array([0.79746436, 0.79746436, 0.79746436, 0.80327438, 0.80311007,
0.80327438, 0.80187445, 0.80311007, 0.79746436, 0.79746436]),
'split103_train_score': array([0.7962156 , 0.7962156 , 0.79967927, 0.80332039, 0.80222936,
0.80332039, 0.801697 , 0.80222936, 0.7962156 , 0.79643906]),
'split104_train_score': array([0.79542691, 0.79542691, 0.79869998, 0.80231481, 0.80236081,
0.80231481, 0.8012435 , 0.80236081, 0.79542691, 0.79714888]),
'split105_train_score': array([0.79570953, 0.79570953, 0.79931121, 0.80242654, 0.80243311,
0.80242654, 0.8015064 , 0.80243311, 0.79570953, 0.79751037]),
'split106_train_score': array([0.79584097, 0.79584097, 0.7994098 , 0.80222936, 0.80286032,
0.80222936, 0.8016247 , 0.80286032, 0.79584097, 0.79668882]),
'split107_train_score': array([0.79659814, 0.79659814, 0.7993914 , 0.80357143, 0.8018889 ,
0.80357143, 0.80159972, 0.8018889 , 0.79659814, 0.79747226]),
'split108_train_score': array([0.79645355, 0.79645355, 0.79876045, 0.80351885, 0.8017706 ,
0.80351885, 0.80144855, 0.8017706 , 0.79645355, 0.79712393]),
'split109_train_score': array([0.7973934 , 0.7973934 , 0.7973934 , 0.80295363, 0.80182318,
0.80295363, 0.80143541, 0.80182318, 0.7973934 , 0.79657185]),
'split110_train_score': array([0.79605786, 0.79605786, 0.79948209, 0.80377388, 0.80309035,
0.80377388, 0.8018876 , 0.80309035, 0.79605786, 0.79779955]),
'split111_train_score': array([0.79563066, 0.79563066, 0.79874598, 0.80215707, 0.80139467,
0.80215707, 0.80088859, 0.80139467, 0.79563066, 0.79668224]),
'split112_train_score': array([0.79545977, 0.79545977, 0.79899573, 0.80218993, 0.80266314,
0.80218993, 0.80135523, 0.80266314, 0.79545977, 0.79627475]),
'split113_train_score': array([0.79647193, 0.79647193, 0.79981071, 0.80434568, 0.80374102,
0.80434568, 0.8020059 , 0.80374102, 0.79647193, 0.79793758]),
'split114_train_score': array([0.79616302, 0.79616302, 0.79981071, 0.80358985, 0.80366215,
0.80358985, 0.80097403, 0.80366215, 0.79616302, 0.797931 ]),
'split115_train_score': array([0.79632733, 0.79632733, 0.799791 , 0.80391848, 0.80364243,
0.80391848, 0.80225565, 0.80364243, 0.79632733, 0.79777984]),
'split116_train_score': array([0.79541377, 0.79541377, 0.79875913, 0.80220307, 0.80193361,
0.80220307, 0.80106605, 0.80193361, 0.79541377, 0.79622875]),
'split117_train_score': array([0.79590804, 0.79590804, 0.79921394, 0.80308507, 0.80198091,
0.80308507, 0.80170487, 0.80198091, 0.79590804, 0.7967493 ]),
'split118_train_score': array([0.79652584, 0.79652584, 0.79941769, 0.8031508 , 0.8020072 ,
0.8031508 , 0.80193491, 0.8020072 , 0.79652584, 0.79742626]),
'split119_train_score': array([0.79568458, 0.79568458, 0.79908907, 0.80345313, 0.80213865,
0.80345313, 0.80174431, 0.80213865, 0.79568458, 0.79742626]),
'split120_train_score': array([0.7978127 , 0.7978127 , 0.79963326, 0.8031035 , 0.80322837,
0.8031035 , 0.80190074, 0.80322837, 0.7978127 , 0.7978127 ]),
'split121_train_score': array([0.79540719, 0.79540719, 0.79911404, 0.80324809, 0.80287346,
0.80324809, 0.80163127, 0.80287346, 0.79540719, 0.79703715]),
'split122_train_score': array([0.79620903, 0.79620903, 0.79898916, 0.80241996, 0.8023871 ,
0.80241996, 0.80118435, 0.8023871 , 0.79620903, 0.79620903]),
'split123_train_score': array([0.79599871, 0.79599871, 0.79954125, 0.80311007, 0.80209134,
0.80311007, 0.80190074, 0.80209134, 0.79599871, 0.79682684]),
'split124_train_score': array([0.7959067 , 0.7959067 , 0.79901545, 0.8033401 , 0.80324152,
0.8033401 , 0.80157869, 0.80324152, 0.7959067 , 0.79712917]),
'split125_train_score': array([0.79680055, 0.79680055, 0.80065198, 0.80390533, 0.80306406,
0.80390533, 0.80281431, 0.80306406, 0.79680055, 0.79816761]),
'split126_train_score': array([0.79566352, 0.79566352, 0.79923891, 0.80320208, 0.80315608,
0.80320208, 0.80062569, 0.80315608, 0.79566352, 0.7974315 ]),
'split127_train_score': array([0.79587518, 0.79587518, 0.79924023, 0.80336111, 0.80174431,
0.80336111, 0.80148799, 0.80174431, 0.79587518, 0.79672301]),
'split128_train_score': array([0.79767601, 0.79767601, 0.79767601, 0.80336768, 0.80351228,
0.80336768, 0.80169173, 0.80351228, 0.79767601, 0.79735396]),
'split129_train_score': array([0.79577659, 0.79577659, 0.79902335, 0.80333482, 0.80219123,
0.80333482, 0.80184289, 0.80219123, 0.79577659, 0.79750513]),
'split130_train_score': array([0.79613673, 0.79613673, 0.7994098 , 0.80397106, 0.80218993,
0.80397106, 0.80172986, 0.80218993, 0.79613673, 0.79747093]),
'split131_train_score': array([0.79642592, 0.79642592, 0.79948209, 0.80318894, 0.80232795,
0.80318894, 0.80064541, 0.80232795, 0.79642592, 0.79728033]),
'split132_train_score': array([0.79593956, 0.79593956, 0.79931778, 0.8029129 , 0.80339268,
0.8029129 , 0.80202562, 0.80339268, 0.79593956, 0.79774698]),
'split133_train_score': array([0.79752351, 0.79752351, 0.79752351, 0.80318236, 0.80201247,
0.80318236, 0.80190732, 0.80201247, 0.79752351, 0.79752351]),
'split134_train_score': array([0.79598557, 0.79598557, 0.79957411, 0.80349127, 0.80297862,
0.80349127, 0.80218336, 0.80297862, 0.79598557, 0.79765496]),
'split135_train_score': array([0.79656394, 0.79656394, 0.79966612, 0.80364901, 0.80174301,
0.80364901, 0.80161813, 0.80174301, 0.79656394, 0.79745779]),
'split136_train_score': array([0.79559779, 0.79559779, 0.7991469 , 0.80321523, 0.80255799,
0.80321523, 0.80160498, 0.80255799, 0.79559779, 0.79645221]),
'split137_train_score': array([0.79667043, 0.79667043, 0.79948998, 0.80367001, 0.8033414 ,
0.80367001, 0.80210579, 0.8033414 , 0.79667043, 0.79757742]),
'split138_train_score': array([0.7972488 , 0.7972488 , 0.7972488 , 0.8031968 , 0.80171802,
0.8031968 , 0.80134997, 0.80171802, 0.7972488 , 0.79642068]),
'split139_train_score': array([0.79571744, 0.79571744, 0.79878017, 0.80234239, 0.80275645,
0.80234239, 0.8012711 , 0.80275645, 0.79571744, 0.79741969]),
'split140_train_score': array([0.79567666, 0.79567666, 0.79916004, 0.80323494, 0.80305092,
0.80323494, 0.80172986, 0.80305092, 0.79567666, 0.79749722]),
'split141_train_score': array([0.79563723, 0.79563723, 0.7991469 , 0.80295233, 0.80174958,
0.80295233, 0.80163785, 0.80174958, 0.79563723, 0.79737235]),
'split142_train_score': array([0.79595928, 0.79595928, 0.79916004, 0.80317579, 0.80198618,
0.80317579, 0.80151297, 0.80198618, 0.79595928, 0.79730662]),
'split143_train_score': array([0.79767468, 0.79767468, 0.79953467, 0.80378045, 0.80212421,
0.80378045, 0.8020782 , 0.80212421, 0.79767468, 0.79767468]),
'split144_train_score': array([0.79566352, 0.79566352, 0.79931778, 0.80242654, 0.80083601,
0.80242654, 0.80026421, 0.80083601, 0.79566352, 0.79603157]),
'split145_train_score': array([0.79652451, 0.79652451, 0.79988958, 0.80402363, 0.80352413,
0.80402363, 0.80100689, 0.80352413, 0.79652451, 0.79827934]),
'split146_train_score': array([0.79620903, 0.79620903, 0.7995281 , 0.80343869, 0.80311007,
0.80343869, 0.80205848, 0.80311007, 0.79620903, 0.79706344]),
'split147_train_score': array([0.79623666, 0.79623666, 0.79969373, 0.80276303, 0.80317709,
0.80276303, 0.80205321, 0.80317709, 0.79623666, 0.79753799]),
'split148_train_score': array([0.79503391, 0.79503391, 0.79904306, 0.80297991, 0.80234239,
0.80297991, 0.80092276, 0.80234239, 0.79503391, 0.79674273]),
'split149_train_score': array([0.79592776, 0.79592776, 0.79922709, 0.80322309, 0.80195462,
0.80322309, 0.8018166 , 0.80195462, 0.79592776, 0.79761028]),
'mean_train_score': array([0.79621754, 0.79621754, 0.79914988, 0.80319296, 0.80257796,
0.80319301, 0.80159946, 0.80257796, 0.79621754, 0.79734926]),
'std_train_score': array([0.00065176, 0.00065176, 0.00075839, 0.00049259, 0.0006383 ,
0.00049262, 0.0005407 , 0.0006383 , 0.00065176, 0.00054179])}
rand_cv.best_params_
{'min_samples_split': 15, 'max_depth': 6, 'criterion': 'entropy'}
2.- GridSearchCV
Explora todas las combinaciones posibles del diccionario de hiperparámetros que le pasemos
st = time.time()
# Exhaustive search: unlike RandomizedSearchCV, GridSearchCV evaluates EVERY
# combination in the hyperparameter grid, so it is slower but complete.
grid_cv = GridSearchCV(
    estimator = DecisionTreeClassifier(),  # base model to tune
    param_grid = param_grid,               # hyperparameter grid to explore
    cv = cv,                               # cross-validation strategy (repeated k-fold)
    return_train_score = True,             # also keep train scores (to spot over/underfitting)
    verbose = False,
    n_jobs = -1                            # use all available processors
)
# Fit the search on the training set
grid_cv.fit(X_train, y_train)
# Score of the best estimator on the training set
score_grid = grid_cv.score(X_train, y_train)
et = time.time()
tt_grid = round((et - st)/60, 2)
print("Total time took: ", str(tt_grid), " minutes!")
# FIX: the original printed score_rand (copy-pasted from the RandomizedSearchCV
# cell); this cell must report the GridSearchCV score.
print(f'El score del mejor estimador es: {round(score_grid,2)}')
Total time took: 43.12 minutes! El score del mejor estimador es: 0.8
grid_cv.best_params_
{'criterion': 'gini', 'max_depth': 6, 'min_samples_split': 15}
3.- Comparación de resultados
def _search_summary(search, score, label):
    # One-column frame: the search's best hyperparameters plus its rounded score.
    rows = dict(search.best_params_)
    rows["Score"] = round(score, 3)
    return pd.DataFrame.from_dict(rows, orient="index", columns=[label])

# Side-by-side comparison of both hyperparameter searches.
pd.concat(
    [
        _search_summary(rand_cv, score_rand, "RandomizedSearchCV"),
        _search_summary(grid_cv, score_grid, "GridSearchCV"),
    ],
    axis = 1
)
| RandomizedSearchCV | GridSearchCV | |
|---|---|---|
| min_samples_split | 15 | 15 |
| max_depth | 6 | 6 |
| criterion | entropy | gini |
| Score | 0.80 | 0.80 |
# Compare wall-clock times of both searches (ratio GridSearchCV / RandomizedSearchCV).
# FIX: message had a typo ("Gridserach") and was missing "veces" ("es 5.2 más lento"
# is ungrammatical Spanish for a ratio).
print(f'El GridSearchCV es {round((tt_grid/tt_rand), 2)} veces más lento que el RandomizedSearchCV')
El GridSearchCV es 5.2 veces más lento que el RandomizedSearchCV
4.- Se elige el mejor estimador
En función del score que ha sacado
# Keep whichever search produced the higher training score; ties favour
# RandomizedSearchCV (same behaviour as the original ternary expression).
if score_rand >= score_grid:
    final_pipe = rand_cv
else:
    final_pipe = grid_cv
final_pipe.best_estimator_
DecisionTreeClassifier(max_depth=6, min_samples_split=15)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(max_depth=6, min_samples_split=15)